/* blob: f14dd4c47166c69faa303ccb32b4eb814704a666 [file] [log] [blame] */
/*@z36.c:Hyphenation: Declarations@*******************************************/
/* */
/* THE LOUT DOCUMENT FORMATTING SYSTEM (VERSION 3.24) */
/* COPYRIGHT (C) 1991, 2000 Jeffrey H. Kingston */
/* */
/* Jeffrey H. Kingston (jeff@cs.usyd.edu.au) */
/* Basser Department of Computer Science */
/* The University of Sydney 2006 */
/* AUSTRALIA */
/* */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either Version 2, or (at your option) */
/* any later version. */
/* */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/* */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 59 Temple Place, Suite 330, Boston MA 02111-1307 USA */
/* */
/* FILE: z36.c */
/* MODULE: Hyphenation */
/* EXTERNS: Hyphenate() */
/* */
/*****************************************************************************/
#include "externs.h"
/* Module-wide constants for the trie layout and the character classes. */
#define NODE_MULT 4 /* child node offsets are stored divided by this (see TrieInsert) */
#define MAX_CHAR 256 /* number of entries in the per-character class table */
#define TRIE_MAGIC 5361534 /* stored in trie_rec.magic and checked by TriePrint */
#define KILL_CLASS 0 /* characters preventing hyphenation */
#define PUNCT_CLASS 1 /* characters delimiting hyphenation */
/*****************************************************************************/
/* */
/* static tex_codes[] */
/* */
/* TeX hyphenation files often contain escape sequences consisting of a */
/* backslash and two or three characters to denote 8-bit characters. This */
/* code will read and translate such sequences if they are in the following */
/* list. */
/* */
/*****************************************************************************/
/* Entries come in groups of three strings: [i] a descriptive glyph name, */
/* [i+1] the escape body expected right after a backslash, and [i+2] the */
/* single-character replacement, written as an octal escape (presumably */
/* ISO Latin-1 codes - TODO confirm). DecodeEscapes walks the table three */
/* entries at a time and stops at the all-empty sentinel group at the end. */
static char *tex_codes[] = {
"Agrave", "`A", "\300",
"Aacute", "'A", "\301",
"Acircumflex", "^A", "\302",
"Atilde", "~A", "\303",
"Adieresis", "\"A", "\304",
"agrave", "`a", "\340",
"aacute", "'a", "\341",
"acircumflex", "^a", "\342",
"atilde", "~a", "\343",
"adieresis", "\"a", "\344",
"ccedilla", "cc", "\347",
"Egrave", "`E", "\310",
"Eacute", "'E", "\311",
"Ecircumflex", "^E", "\312",
"Edieresis", "\"E", "\313",
"egrave", "`e", "\350",
"eacute", "'e", "\351",
"ecircumflex", "^e", "\352",
"edieresis", "\"e", "\353",
"Igrave", "`I", "\314",
"Iacute", "'I", "\315",
"Icircumflex", "^I", "\316",
"Idieresis", "\"I", "\317",
"igrave", "`\\i", "\354",
"iacute", "'\\i", "\355",
"icircumflex", "^\\i", "\356",
"idieresis", "\"\\i","\357",
"Ograve", "`O", "\322",
"Oacute", "'O", "\323",
"Ocircumflex", "^O", "\324",
"Otilde", "~O", "\325",
"Odieresis", "\"O", "\326",
"ograve", "`o", "\362",
"oacute", "'o", "\363",
"ocircumflex", "^o", "\364",
"otilde", "~o", "\365",
"odieresis", "\"o", "\366",
"Ugrave", "`U", "\331",
"Uacute", "'U", "\332",
"Ucircumflex", "^U", "\333",
"Udieresis", "\"U", "\334",
"ugrave", "`u", "\371",
"uacute", "'u", "\372",
"ucircumflex", "^u", "\373",
"udieresis", "\"u", "\374",
"", "", ""
};
/*
 * DecodeEscapes(str, fname, hline_num)
 *
 * Rewrite str in place, replacing every backslash escape listed in
 * tex_codes[] by its one-character equivalent.  All other characters are
 * copied through unchanged.  An escape that is not in the table is reported
 * as a FATAL error mentioning file fname and line hline_num.
 *
 * The output never grows: each replacement is shorter than the escape it
 * replaces, so the write cursor never overtakes the read cursor.
 */
static void DecodeEscapes(FULL_CHAR *str, FULL_CHAR *fname, int hline_num)
{
  FULL_CHAR *dst = str;   /* next position to write */
  FULL_CHAR *src = str;   /* next position to read  */
  int entry;

  while( *src != '\0' )
  {
    /* ordinary character: copy and move on */
    if( *src != '\\' )
    {
      *dst++ = *src++;
      continue;
    }

    /* find the table group whose escape body follows the backslash */
    entry = 0;
    while( tex_codes[entry][0] != '\0' )
    {
      if( StringBeginsWith(src+1, AsciiToFull(tex_codes[entry+1])) )
        break;
      entry += 3;
    }

    if( tex_codes[entry][0] == '\0' )
    {
      /* reached the sentinel group: nothing matched */
      Error(36, 1, "in hyphenation file %s, unknown escape sequence (line %d)",
        FATAL, no_fpos, fname, hline_num);
    }
    else
    {
      /* emit the replacement, then skip the backslash and the escape body */
      StringCopy(dst, tex_codes[entry+2]);
      dst += StringLength(AsciiToFull(tex_codes[entry+2]));
      src += StringLength(AsciiToFull(tex_codes[entry+1])) + 1;
    }
  }
  *dst = '\0';
} /* end DecodeEscapes */
/*****************************************************************************/
/* */
/* static TRIE HyphTables[] */
/* */
/* The packed hyphenation tables, indexed by language. An entry is NULL */
/* when the table for that language has not yet been read in; TriedFile */
/* is TRUE after we have tried to read that file, whether or not we were */
/* successful. */
/* */
/*****************************************************************************/
/* In-memory hyphenation trie. NewTrie allocates this record, the node */
/* array and the string area as one malloc block, in that order. */
/* A node_mem entry is 0 when the slot is empty, negative when it is the */
/* negated offset of a string in string_mem, and positive when it is a */
/* child node offset divided by NODE_MULT (see TrieInsert). */
typedef struct trie_rec
{ int magic; /* a magic number to make sure ok */
int class_count; /* the number of character classes */
unsigned char class[MAX_CHAR]; /* the character classes */
short *node_mem; /* the node memory */
int node_lim; /* top of node memory */
int node_free; /* first free space in node memory */
FULL_CHAR *string_mem; /* the string memory */
int string_lim; /* top of string memory */
int string_first; /* the first (last inserted) string */
} *TRIE;
/* One trie per language number, plus a flag recording that a load was */
/* already attempted for that language. */
static TRIE HyphTables[MAX_LANGUAGE] = { NULL };
static BOOLEAN TriedFile[MAX_LANGUAGE] = { FALSE };
/*@::CompressValue(), UncompressValue()@**************************************/
/* */
/* CompressValue(compressed, uncompressed) */
/* */
/* Compress the value string uncompressed, placing the result in compressed. */
/* */
/*****************************************************************************/
/* Nibble helpers: one compressed byte carries two 4-bit codes, the high */
/* nibble first. */
#define FirstHalf(y) ( (y) >> 4 )
#define LastHalf(y) ( (y) & 15 )
#define AssignFirstHalf(x, y) ( (x) = ((y) << 4) )
#define AssignLastHalf(x, y) ( (x) |= (y) )
/* CompressValue packs two rating characters per output byte. Each code */
/* is the character minus '0' plus 2, so a digit never produces a zero */
/* nibble and a zero nibble (or zero byte) marks the end of the string. */
/* NOTE: this is a statement macro that declares its own locals p and q */
/* and expands its arguments without parentheses, so do not pass */
/* expressions that mention p or q. */
#define CompressValue(compressed, uncompressed) \
/* FULL_CHAR *compressed, *uncompressed; */ \
{ register FULL_CHAR *p, *q; \
p = compressed; q = uncompressed; \
for( ; ; ) \
{ \
if( *q == (FULL_CHAR) '\0' ) \
{ *p = (FULL_CHAR) '\0'; \
break; \
} \
AssignFirstHalf(*p, *q++ - '0' + 2); \
if( *q == (FULL_CHAR) '\0' ) \
{ *++p = (FULL_CHAR) '\0'; \
break; \
} \
AssignLastHalf(*p, *q++ - '0' + 2); \
p++; \
} \
}
/*****************************************************************************/
/* */
/* UncompressValue(compressed, uncompressed) */
/* */
/* Uncompress the string compressed, placing the result in uncompressed. */
/* */
/*****************************************************************************/
/* Inverse of CompressValue: emit one character per non-zero nibble */
/* (nibble + '0' - 2), stop at the first zero nibble, and NUL-terminate */
/* the output. Statement macro with its own locals p and q; arguments */
/* are expanded without parentheses. */
#define UncompressValue(compressed, uncompressed) \
/* FULL_CHAR *compressed, *uncompressed; */ \
{ register FULL_CHAR *p, *q; \
p = compressed; q = uncompressed; \
for( ; ; ) \
{ \
if( FirstHalf(*p) == '\0' ) break; \
*q++ = FirstHalf(*p) + '0' - 2; \
if( LastHalf(*p) == '\0' ) break; \
*q++ = LastHalf(*p) + '0' - 2; \
p++; \
} \
*q = (FULL_CHAR) '\0'; \
}
/*@::AltCompressValue(), AltUncompressValue()@********************************/
/* */
/* AltCompressValue(compressed, uncompressed) */
/* */
/* Compress value string, placing the result in compressed. */
/* */
/* This is an alternative compression scheme to the one given above, which */
/* should give better compression. The result is a sequence of pairs of */
/* the form (skip, value) saying that we are to skip so many places and */
/* then insert the given non-zero value. All the other values are zero. */
/* Skip values are 4-bit numbers (maximum skip is 15, but we will insert */
/* a 15 skip with a zero value in the rare case of skipping further). */
/* Values are also 4-bit numbers, known to be non-zero. So the memory */
/* cost is 8 bits per non-zero value. */
/* */
/*****************************************************************************/
/* CharPack stores a in the high nibble and b in the low nibble of ch; */
/* CharUnPack splits ch back into a (high nibble) and b (low nibble). */
#define CharPack(ch, a, b) (ch = ((a) << 4) | (b))
#define CharUnPack(ch, a, b) ((a) = (ch) >> 4, (b) = (ch) & 15)
/* AltCompressValue emits one byte per recorded position: the high */
/* nibble is the number of '0' characters skipped since the previous */
/* recorded position, the low nibble is the character minus '0' plus 2. */
/* A position is recorded when its character is not '0', or when 15 */
/* zeros have accumulated (that '0' is then stored with code 2). */
/* The output is NUL-terminated; every emitted byte has a low nibble of */
/* at least 2 for digit input, so none of them is zero. */
/* Statement macro: it declares locals p, q and prev, so the arguments */
/* must not be expressions that mention those names. */
#define AltCompressValue(compressed, uncompressed) \
/* FULL_CHAR *compressed, *uncompressed; */ \
{ register FULL_CHAR *p, *q, *prev; \
prev = (uncompressed) - 1; p = (compressed); \
for( q = (uncompressed); *q != (FULL_CHAR) '\0'; q++ ) \
{ \
if( *q != (FULL_CHAR) '0' || q - prev - 1 >= 15 ) \
{ \
CharPack(*p++, q - prev - 1, *q - '0' + 2); \
prev = q; \
} \
} \
*p++ = (FULL_CHAR) '\0'; \
}
/*****************************************************************************/
/* */
/* AltUncompressValue(compressed, uncompressed) */
/* */
/* Uncompress the string compressed, placing the result in uncompressed. */
/* */
/*****************************************************************************/
/* Inverse of AltCompressValue: for each compressed byte, write as many */
/* '0' characters as the high nibble says, then the character encoded by */
/* the low nibble (nibble + '0' - 2); finally NUL-terminate. Trailing */
/* zeros that AltCompressValue dropped are not regenerated, so the output */
/* can be shorter than the original value string. */
/* The caller must supply an output buffer large enough for the result; */
/* the macro does no bounds checking. It declares locals p, q, xval, i */
/* and skip, so the arguments must not mention those names. */
#define AltUncompressValue(compressed, uncompressed) \
{ register FULL_CHAR *p, *q, xval; int i, skip; \
q = (uncompressed); \
for( p = (compressed); *p != (FULL_CHAR) '\0'; p++ ) \
{ CharUnPack(*p, skip, xval); \
for( i = 0; i < skip; i++ ) \
*q++ = (FULL_CHAR) '0'; \
*q++ = (FULL_CHAR) (xval + '0' - 2); \
} \
*q++ = (FULL_CHAR) '\0'; \
debug1(DHY, D, "AltUncompressValue returning %s", (uncompressed)); \
}
/* ***
static void AltUncompressValue(FULL_CHAR *compressed, FULL_CHAR *uncompressed)
{ register FULL_CHAR *p, *q, xval; int i, skip;
q = uncompressed;
for( p = compressed; *p != (FULL_CHAR) '\0'; p++ )
{ CharUnPack(*p, skip, xval);
for( i = 0; i < skip; i++ )
*q++ = (FULL_CHAR) '0';
*q++ = (FULL_CHAR) (xval + '0' - 2);
}
*q++ = (FULL_CHAR) '\0';
debug1(DHY, D, "AltUncompressValue returning %s", uncompressed);
}
*** */
/*@@**************************************************************************/
/* */
/* ClassConvert(in, out, T, fname, hline_num) */
/* */
/* Set out[i] to the character class of in[i] in T, for all i. */
/* */
/*****************************************************************************/
/* Translate each character of the NUL-terminated string in[] to its */
/* class number in T->class[], writing the result to out[] and */
/* NUL-terminating it. A character whose class is 0 raises a FATAL */
/* error that names fname and hline_num. */
/* Statement macro: it declares its own local i and expands in, out and T */
/* without parentheses, so pass simple identifiers only. in[] is used */
/* directly as an array index (assumes FULL_CHAR is unsigned - TODO */
/* confirm against externs.h). */
#define ClassConvert(in, out, T, fname, hline_num) \
{ int i; \
for( i = 0; in[i] != '\0'; i++ ) \
if( T->class[in[i]] != 0 ) out[i] = T->class[in[i]]; \
else \
Error(36, 2, "in hyphenation file %s, line %d: character (octal %o) is not in any class", \
FATAL, no_fpos, fname, hline_num, in[i]); \
out[i] = '\0'; \
} /* end ClassConvert */
/*@::findrep(), TrieRetrieve(), ShowRate()@***********************************/
/* */
/* findrep(i, T) Returns one character whose class in T is i. */
/* */
/*****************************************************************************/
#if DEBUG_ON
/*
 * findrep(i, T)
 *
 * Return the lowest-numbered character whose class in T is i.  If no
 * character belongs to class i an internal error is raised.
 */
static FULL_CHAR findrep(int i, TRIE T)
{
  int ch = 0;

  while( ch < MAX_CHAR )
  {
    if( T->class[ch] == i )
      return (FULL_CHAR) ch;
    ch++;
  }
  Error(36, 3, "DoTriePrint: findrep failed", INTERN, no_fpos);
  return (FULL_CHAR) ch;  /* only here to keep the compiler quiet */
} /* end findrep */
#if 0
/*****************************************************************************/
/* */
/* static FULL_CHAR *TrieRetrieve(key, T) */
/* */
/* Retrieve the value associated with key in T, or NULL if not present. */
/* This procedure is not presently in use. */
/* */
/*****************************************************************************/
/* Disabled lookup routine (compiled out by the surrounding #if 0). */
/* Converts key to class codes, then walks the trie one code per level: */
/* a 0 entry means "not present", a negative entry is the negated offset */
/* of the stored remainder in string_mem, and a positive entry is a child */
/* node index that must be multiplied by NODE_MULT. */
static FULL_CHAR *TrieRetrieve(FULL_CHAR *key, TRIE T)
{ FULL_CHAR str[MAX_BUFF]; int i, curr_node, next_node, pos;
debug1(DHY, DD, "TrieRetrieve(%s, T)", key);
ClassConvert(key, str, T, STR_EMPTY, 0);
/* invariant: curr_node is an existing node of T with prefix str[0..i-1] */
curr_node = i = 0;
for(;;)
{
/* if next_node is 0, the string was never inserted */
next_node = T->node_mem[curr_node + str[i]];
if( next_node == 0 ) return (FULL_CHAR *) NULL;
/* if next_node < 0 it represents an offset into the string memory */
if( next_node < 0 )
{ pos = - next_node;
if( str[i] != '\0' )
{ do
{ if( str[++i] != T->string_mem[pos++] ) return (FULL_CHAR *) NULL;
} while( str[i] != '\0' );
}
/* pos now indexes the compressed value that follows the stored suffix */
return &(T->string_mem[pos]);
}
/* otherwise next_node is the trie node to be searched next */
curr_node = NODE_MULT*next_node; i++;
}
} /* end TrieRetrieve */
#endif
/*****************************************************************************/
/* */
/* static ShowRate(key, start, stop, rate, fp) */
/* */
/* Debug print of key[] and rate[] on file fp. */
/* */
/*****************************************************************************/
/*
 * ShowRate(key, start, stop, rate, fp)
 *
 * Debug helper: print key[start..stop-1] and the whole of the
 * NUL-terminated string rate[] on fp, one space before each character,
 * on two lines labelled "key:" and "rate:".
 */
static void ShowRate(FULL_CHAR *key, int start, int stop, FULL_CHAR *rate,
  FILE *fp)
{
  int k;
  FULL_CHAR *r;

  fputs("key: ", fp);
  for( k = start;  k < stop;  k++ )
    fprintf(fp, " %c", key[k]);

  fputs("\nrate:", fp);
  for( r = rate;  *r != '\0';  r++ )
    fprintf(fp, " %c", *r);

  fputs("\n", fp);
} /* end ShowRate */
/*@::DoTriePrint(), TriePrint()@**********************************************/
/* */
/* static DoTriePrint(T, node, len, fp) */
/* */
/* Print on file fp the subset of the entries of trie T stored in node and */
/* its descendants. The node has prefix prefix[0..len-1]. */
/* */
/*****************************************************************************/
/* Characters chosen (via findrep) for the path from the root to the node */
/* currently being printed; shared by all levels of the recursion. */
static FULL_CHAR prefix[MAX_BUFF];

/*
 * DoTriePrint(T, node, len, fp)
 *
 * Print on fp every entry stored in the trie node at offset `node` of
 * T->node_mem and in its descendants.  prefix[0..len-1] holds one
 * representative character per class code on the path to this node.
 *
 * For each class slot i of the node:
 *   - a negative entry is the negated offset of a string in string_mem:
 *     the key is prefix + class i + stored suffix (slot 0 means the key
 *     ends here and only the value is stored), followed by the value;
 *   - a positive entry is a child node index (times NODE_MULT), which is
 *     printed recursively;
 *   - a zero entry is empty.
 *
 * The scratch buffer for the uncompressed value is MAX_BUFF characters,
 * the same bound TrieInsert uses for keys and values; a small fixed
 * buffer here could be overrun by the value of a long key.
 */
static void DoTriePrint(TRIE T, int node, int len, FILE *fp)
{ int i, next_node, pos;  FULL_CHAR str[MAX_BUFF];
  for( i = 0;  i < T->class_count;  i++ )
  {
    /* if next_node < 0, have string to print */
    next_node = T->node_mem[node + i];
    if( next_node < 0 )
    {
      prefix[len] = '\0';
      fprintf(fp, "%s", prefix);
      pos = - next_node;
      if( i != 0 )
      {
        /* print this slot's character, then the stored suffix */
        fprintf(fp, "%c", findrep(i, T));
        while( T->string_mem[pos] != '\0' )
        { fprintf(fp, "%c", findrep(T->string_mem[pos], T));
          pos++;
        }
        pos++;  /* step over the suffix terminator onto the value */
      }
      AltUncompressValue(&(T->string_mem[pos]), str);
      fprintf(fp, " %s\n", str);
    }

    /* else if next_node > 0 have a child node to explore */
    else if( next_node > 0 )
    { assert( i > 0, "DoTriePrint: i == 0!" );
      prefix[len] = findrep(i, T);
      prefix[len+1] = '\0';
      DoTriePrint(T, NODE_MULT*next_node, len+1, fp);
    }
  }
} /* end DoTriePrint */
/*****************************************************************************/
/* */
/* static TriePrint(T, fp) */
/* */
/* Print trie T on file fp. */
/* */
/*****************************************************************************/
/*
 * TriePrint(T, fp)
 *
 * Debug dump of trie T on fp: the members of each character class from
 * class 1 upward, the node and string memory usage, and then every stored
 * entry (via DoTriePrint starting at the root node).
 */
static void TriePrint(TRIE T, FILE *fp)
{
  int cls, ch;

  assert( T->magic == TRIE_MAGIC, "TriePrint: magic!" );

  /* one space-separated group of characters per class */
  fprintf(fp, "Classes:");
  for( cls = 1;  cls < T->class_count;  cls++ )
  {
    fprintf(fp, " ");
    for( ch = 0;  ch < MAX_CHAR;  ch++ )
    {
      if( T->class[ch] != cls )
        continue;
      fprintf(fp, "%c", ch);
    }
  }
  fprintf(fp, "\n");

  /* memory statistics */
  fprintf(fp, "Node space: %d capacity, %d used\n", T->node_lim, T->node_free);
  fprintf(fp, "String space: %d capacity, %d used\n", T->string_lim,
    T->string_lim - T->string_first);

  /* the entries themselves, starting from an empty prefix at the root */
  prefix[0] = '\0';
  DoTriePrint(T, 0, 0, fp);
} /* end TriePrint */
#endif
/*@::NewTrie(), NewTrieString(), NewTrieNode()@*******************************/
/* */
/* static TRIE NewTrie(node_lim, string_lim) */
/* */
/* Initialize a new trie with this much space for nodes and strings. */
/* */
/*****************************************************************************/
/*
 * NewTrie(node_lim, string_lim)
 *
 * Allocate and initialise an empty trie with room for node_lim node slots
 * (shorts) and string_lim string bytes.  The record, the node array and
 * the string area live in a single malloc block, in that order.  Running
 * out of memory is a FATAL error.
 *
 * On return there is one class (class_count == 1), every character has
 * class 0, no node is allocated, and the string area is empty
 * (string_first == string_lim).
 */
static TRIE NewTrie(unsigned node_lim, unsigned string_lim)
{
  TRIE T;
  int ch;

  debug2(DHY, DD, "NewTrie(%d, %d)", node_lim, string_lim);
  ifdebug(DMA, DD, DebugRegisterUsage(MEM_HYPH_PATS, 1,
    sizeof(struct trie_rec) + node_lim*sizeof(short)+string_lim*sizeof(char)));

  T = (TRIE) malloc( sizeof(struct trie_rec)
                     + node_lim*sizeof(short) + string_lim*sizeof(char));
  if( T == (TRIE) NULL )
    Error(36, 4, "run out of memory while constructing hyphenation table",
      FATAL, no_fpos);

  /* header and class table */
  T->magic = TRIE_MAGIC;
  T->class_count = 1;
  for( ch = 0;  ch < MAX_CHAR;  ch++ )
    T->class[ch] = 0;

  /* node memory starts immediately after the record */
  T->node_mem = (short *) (T + 1);
  T->node_lim = node_lim;
  T->node_free = 0;

  /* string memory starts immediately after the node memory */
  T->string_mem = (FULL_CHAR *) (T->node_mem + node_lim);
  T->string_lim = string_lim;
  T->string_first = string_lim;

  debug0(DHY, DD, "NewTrie returning.");
  return T;
} /* end NewTrie */
/*****************************************************************************/
/* */
/* static short NewTrieString(str, T) */
/* */
/* Copy a new string into T, and return its offset in string_mem, or a */
/* negative value if there is not enough string memory left. */
/* */
/*****************************************************************************/
/*
 * NewTrieString(str, T)
 *
 * Copy the NUL-terminated string str into the string memory of T and
 * return its offset there.  Strings are allocated downward from
 * T->string_first, which is updated to the new string's offset.
 *
 * Returns a negative value, and leaves T unchanged, when the string does
 * not fit.  Offset 0 is never handed out: callers store the negated
 * offset in node_mem, where 0 means "empty slot", so a string at offset 0
 * could not be told apart from no string at all.
 *
 * The arithmetic is done in int so that a string longer than the space
 * left produces a plain negative number rather than depending on unsigned
 * wrap-around and narrowing to short (StringLength presumably yields an
 * unsigned size - verify against externs.h).
 */
static short NewTrieString(FULL_CHAR *str, TRIE T)
{
  int len = (int) StringLength(str);
  int res = T->string_first - len - 1;

  if( res <= 0 )
    return -1;

  T->string_first = res;
  StringCopy(&(T->string_mem[res]), str);
  return (short) res;
} /* end NewTrieString */
/*****************************************************************************/
/* */
/* static int NewTrieNode(T) */
/* */
/* Allocate a new empty trie node in T, and return its offset in node_mem. */
/* */
/*****************************************************************************/
/*
 * NewTrieNode(T)
 *
 * Reserve the next T->class_count slots of node memory as a fresh node,
 * clear them to 0 (empty), and return the offset of the first slot.
 * Exceeding the node limit is an internal error.
 */
static int NewTrieNode(TRIE T)
{
  int first, slot, limit;

  limit = T->node_free + T->class_count;
  if( limit > T->node_lim )
    Error(36, 5, "hyphenation trie node limit exceeded", INTERN, no_fpos);

  first = T->node_free;
  T->node_free = limit;
  for( slot = first;  slot < limit;  slot++ )
    T->node_mem[slot] = 0;

  return first;
} /* end NewTrieNode */
/*@::AddClassToTrie(), TrieInsert()@******************************************/
/* */
/* static AddClassToTrie(str, T) */
/* */
/* Add a new character class, whose members are the characters of str, to */
/* trie T. This cannot occur after the first insertion. */
/* */
/*****************************************************************************/
/*
 * AddClassToTrie(str, T)
 *
 * Define a new character class in T whose members are the characters of
 * the NUL-terminated string str, then advance T->class_count.  A
 * character that already has a class raises an internal error.  Must be
 * called before anything has been stored in the string memory.
 */
static void AddClassToTrie(FULL_CHAR *str, TRIE T)
{
  FULL_CHAR *s;

  assert( T->string_first == T->string_lim, "AddClassToTrie: after insertion");

  for( s = str;  *s != '\0';  s++ )
  {
    if( T->class[*s] != 0 )
      Error(36, 6, "hyphenation class of %c may not be changed",
        INTERN, no_fpos, *s);
    else
      T->class[*s] = T->class_count;
  }
  T->class_count++;
} /* end AddClassToTrie */
/*****************************************************************************/
/* */
/* static BOOLEAN TrieInsert(key, value, T, fname, hline_num) */
/* */
/* Insert a new key and value into trie T (originating in file fname on */
/* line hline_num). */
/* */
/*****************************************************************************/
/* Insert key with rating string value into T. The key is first mapped */
/* to class codes and the value is compressed with AltCompressValue. */
/* Returns TRUE on success and FALSE when the string memory of T is */
/* exhausted; inserting a key that is already present, or exceeding the */
/* node limit, raises an internal error. fname and hline_num are used */
/* only for the error message of ClassConvert. */
static BOOLEAN TrieInsert(FULL_CHAR *key, FULL_CHAR *value, TRIE T,
FULL_CHAR *fname, int hline_num)
{ FULL_CHAR str[MAX_BUFF], compressed_value[MAX_BUFF];
int i, curr_node, next_node, pos, ch; short strpos;
debug2(DHY, DD, "TrieInsert(%s, %s, T)", key, value);
/* if first insertion, round class_count up to a multiple of NODE_MULT */
/* (so that every node offset is divisible by NODE_MULT) and create the */
/* root node at offset 0; the returned offset is not needed */
if( T->node_free == 0 )
{ T->class_count = NODE_MULT * ceiling(T->class_count, NODE_MULT);
ch = NewTrieNode(T);
}
AltCompressValue(compressed_value, value);
/* invariant: curr_node is an existing node of T with prefix str[0..i-1] */
ClassConvert(key, str, T, fname, hline_num);
curr_node = i = 0;
for(;;)
{
/* if str is ended, add compressed_value only to string memory */
/* (slot 0 of the node is the "key ends here" slot) */
if( str[i] == '\0' )
{ if( T->node_mem[curr_node] != 0 )
Error(36, 7, "hyphenation string %s already inserted",
INTERN, no_fpos, key);
else
{
strpos = NewTrieString(compressed_value, T);
if( strpos < 0 )
{ debug0(DHY, DD, "TrieInsert returning FALSE (trie full)");
return FALSE;
}
T->node_mem[curr_node] = - strpos;
}
debug0(DHY, DD, "TrieInsert returning TRUE (empty suffix).");
return TRUE;
}
/* if next position is unoccupied, store remainder of str and value */
/* the value is stored first and the suffix second: strings are */
/* allocated downward, so the suffix ends up immediately before the */
/* value and only the suffix offset needs to be recorded */
next_node = T->node_mem[curr_node + str[i]];
if( next_node == 0 )
{ ch = NewTrieString(compressed_value, T);
if( ch < 0 )
{ debug0(DHY, DD, "TrieInsert returning FALSE (trie full)");
return FALSE;
}
strpos = NewTrieString(&str[i+1], T);
if( strpos < 0 )
{ debug0(DHY, DD, "TrieInsert returning FALSE (trie full)");
return FALSE;
}
T->node_mem[curr_node + str[i]] = - strpos;
debug0(DHY, DD, "TrieInsert returning (non-empty suffix).");
return TRUE;
}
/* if next position is occupied by a non-empty string, move that */
/* string down one level and replace it by a trie node */
/* ch is the first class code of the stored suffix (0 if the suffix is */
/* empty); the new node's slot ch then refers to the rest, at pos+1. */
/* If the string was the lowest one in string memory, its first byte */
/* is given back by advancing string_first. */
/* The child is stored as offset/NODE_MULT so that it fits in a short. */
if( next_node < 0 )
{ pos = - next_node;
ch = T->string_mem[pos];
if( T->string_first == pos ) T->string_first++;
T->node_mem[curr_node + str[i]] = next_node = NewTrieNode(T)/NODE_MULT;
T->node_mem[NODE_MULT*next_node + ch] = -(pos+1);
}
/* now next is the offset of the next node to be searched */
curr_node = NODE_MULT*next_node; i++;
}
} /* end TrieInsert */
/*@::BeGetChar(), BePutChar(), BeGetShort(), BePutShort(), etc.@**************/
/* */
/* BeGetChar(fp, pv) */
/* BePutChar(fp, v) */
/* BeGetShort(fp, pv) */
/* BePutShort(fp, v) */
/* BeGetInt(fp, pv) */
/* BePutInt(fp, v) */
/* */
/* Get char, short, or int pv from file fp, and put char, short, or int */
/* onto file fp. These routines are designed so that the file can be */
/* written or read safely by big-endian and little-endian architectures; */
/* this is accomplished by reading and writing one byte at a time to and */
/* from a big-endian format file. All return 0 on success, -1 on failure. */
/* Thanks to David W. Sanderson for this code. */
/* */
/*****************************************************************************/
/* NOTE: BeGetChar and BeGetShort assign to a variable named c that must */
/* be declared (as int) in the calling scope. All four macros expand */
/* pv and v without parentheses, so pass simple expressions only. */
/* BeGetChar: read one byte into *pv; -1 at EOF, else 0. */
#define BeGetChar(fp, pv) ( (c = getc(fp)) == EOF ? -1 : (*pv = c & 0xFF, 0) )
/* BePutChar: write the low byte of v; always evaluates to 0. */
#define BePutChar(fp, v) ( putc( (char) (v & 0xFF), fp), 0 )
/* BeGetShort: read two bytes, high byte first, into *pv; -1 at EOF, */
/* else 0. *pv may already hold the high byte when -1 is returned. */
#define BeGetShort(fp, pv) \
( (c = getc(fp)) == EOF ? -1 : \
( *pv = (c & 0xFF) << 8, \
(c = getc(fp)) == EOF ? -1 : (*pv |= c & 0xFF, 0) \
) \
)
/* BePutShort: write v as two bytes, high byte first; always evaluates */
/* to 0 (putc failures are not detected). */
#define BePutShort(fp, v) \
( putc((v >> 8) & 0xFF, fp), putc(v & 0xFF, fp), 0 )
/*
 * BeGetInt(fp, pv)
 *
 * Read a 32-bit big-endian integer (four bytes, most significant first)
 * from fp into *pv.  Returns 0 on success, or -1 if the end of the file
 * (or a read error) is hit before four bytes have been read; in that case
 * *pv is left untouched.
 *
 * The bytes are assembled in an unsigned accumulator: shifting a byte
 * value >= 0x80 left by 24 in a signed int would shift into the sign bit,
 * which is undefined behaviour in C.
 */
static int BeGetInt(FILE *fp, int *pv)
{
  unsigned int acc = 0;
  int k, c;

  for( k = 0;  k < 4;  k++ )
  {
    c = getc(fp);
    if( c == EOF )
      return -1;
    acc = (acc << 8) | (unsigned int) (c & 0xFF);
  }
  *pv = (int) acc;
  return 0;
}
/*
 * BePutInt(fp, v)
 *
 * Write v to fp as a 32-bit big-endian integer (four bytes, most
 * significant first).  Returns 0 on success and -1 as soon as a byte
 * cannot be written, matching the "0 on success, -1 on failure" contract
 * of this group of routines.
 *
 * The value is converted to unsigned before shifting so that the result
 * does not depend on how the compiler right-shifts negative numbers.
 */
static int BePutInt(FILE *fp, int v)
{
  unsigned int u = (unsigned int) v;
  int shift;

  for( shift = 24;  shift >= 0;  shift -= 8 )
  {
    if( putc((int) ((u >> shift) & 0xFF), fp) == EOF )
      return -1;
  }
  return 0;
}
/*@::CompressTrie(), TrieRead(), AccumulateRating()@**************************/
/* */
/* static CompressTrie(T) */
/* */
/* Compress trie T in place. The resulting length in bytes is computed */
/* only for the debug trace; nothing is returned. */
/* */
/*****************************************************************************/
/*
 * CompressTrie(T)
 *
 * Compact T in place once all insertions are done: the node limit is cut
 * down to the nodes actually used, the strings (which were allocated
 * downward from the top of string memory) are moved so that they follow
 * the last used node directly, and every string reference in node_mem is
 * rebased to the new string area.
 *
 * String references are stored negated, and 0 means "empty slot", so no
 * string may end up at offset 0.  Rebasing by string_first alone would
 * give exactly that offset to the lowest (most recently inserted) string
 * and silently drop its entry.  One pad byte is therefore kept in front
 * of the strings whenever there is room for it, so the lowest string
 * lands at offset 1.
 *
 * On return string_first is 0 and string_lim is the size of the compacted
 * string area (including the pad byte).
 */
static void CompressTrie(TRIE T)
{ FULL_CHAR *p, *q;  int len, i, pad, base;
  debug0(DHY, DD, "CompressTrie(T), T =");
  debug2(DHY, DD, "Node space: %d capacity, %d used\n",
    T->node_lim, T->node_free);
  debug2(DHY, DD, "String space: %d capacity, %d used\n",
    T->string_lim, T->string_lim - T->string_first);
  ifdebug(DHY, DD, TriePrint(T, stderr));

  /* a pad byte fits whenever at least one byte of string memory is unused */
  pad = (T->string_first > 0) ? 1 : 0;
  base = T->string_first - pad;

  /* shrink node memory and rebase all string references */
  T->node_lim = T->node_free;
  for( i = 0;  i < T->node_lim;  i++ )
    if( T->node_mem[i] < 0 )
      T->node_mem[i] = - ( -T->node_mem[i] - base);

  /* slide the strings down behind the nodes; the destination never */
  /* overtakes the source, so a forward byte copy is safe */
  p = (FULL_CHAR *) &(T->node_mem[T->node_free]);
  q = &(T->string_mem[T->string_first]);
  len = T->string_lim - T->string_first;
  if( pad )  *p++ = (FULL_CHAR) '\0';
  for( i = 0;  i < len;  i++ )  *p++ = *q++;

  T->string_mem = (FULL_CHAR *) &(T->node_mem[T->node_lim]);
  T->string_first = 0;
  T->string_lim = len + pad;

  /* total size, for the debug trace only */
  len = sizeof(struct trie_rec) + T->node_lim * sizeof(short)
    + T->string_lim * sizeof(FULL_CHAR);
  debug1(DHY, DD, "CompressTrie returning; len = %d, T =", len);
  ifdebug(DHY, DD, TriePrint(T, stderr));
} /* end CompressTrie */
/*****************************************************************************/
/* */
/* static TRIE TrieRead(lnum, success) */
/* */
/* Read in a packed trie if possible, otherwise pack an unpacked one. */
/* The trie is to be for language lnum. */
/* */
/* Boolean success is set to true if no errors were encountered. If the */
/* file read was a placeholder, success will be true but still a null */
/* TRIE will be returned. */
/* */
/*****************************************************************************/
/* States of the parser for the unpacked hyphenation file. */
#define START_STATE		0	/* expecting the "Classes:" heading   */
#define CLASSES_STATE		1	/* reading character classes          */
#define EXCEPTIONS_STATE	2	/* reading exception words            */
#define LENGTH_LIMIT_STATE	3	/* reading the LengthLimit value      */
#define PATTERNS_STATE		4	/* reading hyphenation patterns       */

/*
 * TrieRead(lnum, success)
 *
 * Build the hyphenation trie for language lnum from its unpacked
 * hyphenation file and return it.
 *
 * *success is set to FALSE and NULL is returned when the language has no
 * hyphenation file, the file cannot be opened, a LengthLimit is
 * malformed, or the trie overflows.  If the file is only a placeholder,
 * *success is TRUE but NULL is still returned.  A missing header line, a
 * missing "Classes:" heading, or a file that never reaches its
 * "Patterns:" section is a FATAL error.
 *
 * The code that writes and re-reads a packed cache file is compiled out
 * (#if 0) in this build.  Because of that, a packed file that happens to
 * open successfully is closed again and ignored; otherwise the function
 * would fall through and return a trie pointer that was never set.
 *
 * Every return taken after the unpacked file has been opened closes that
 * file, and the failure returns also release the partly built trie.
 */
static TRIE TrieRead(LANGUAGE_NUM lnum, BOOLEAN *success)
{ TRIE T;  FILE_NUM unpacked_fnum, packed_fnum;  OBJECT fname;
  FILE *unpacked_fp, *packed_fp;  unsigned len;
  int prev, i, j, c, state, hline_num, length_limit;
#if DEBUG_ON
  int icount = 0;
#endif
  debug2(DHY, DD, "TrieRead(%d %s)", lnum,
    lnum == 0 ? STR_NONE : LanguageString(lnum));

  /* get hyphenation file name from language module */
  fname = LanguageHyph(lnum);
  assert( fname == nilobj || is_word(type(fname)), "TrieRead: fname!" );
  if( fname == nilobj )
  { *success = FALSE;
    return (TRIE) NULL;
  }

  /* define and open packed file */
  debug0(DFS, DD, "  calling DefineFile from TrieRead (1)");
  packed_fnum = DefineFile(string(fname), HYPH_PACKED_SUFFIX,
    &fpos(fname), HYPH_PACKED_FILE, HYPH_PATH);
  if( InitializeAll )
  {
    /* initializing so want to reconstruct packed files */
    /* thanks to Ian Jackson <ian@chiark.greenend.org.uk> for this */
    packed_fp = NULL;
  }
  else
  {
    /* not initializing so use existing packed files if possible */
    packed_fp = OpenFile(packed_fnum, FALSE, FALSE);
  }

  /* the packed-file reader below is compiled out, so an existing packed */
  /* file cannot be used: close it and build from the unpacked file.     */
  /* (Remove this when the packed reader is re-enabled.)                 */
  if( packed_fp != NULL )
  { fclose(packed_fp);
    packed_fp = NULL;
  }

  if( packed_fp == NULL )
  {
    /* no packed file, so define and open unpacked one instead */
    FULL_CHAR str[MAX_BUFF], key[MAX_BUFF], value[MAX_BUFF],
	      buff[MAX_BUFF+10];
    int bpos, bcount;
    debug0(DFS, DD, "  calling DefineFile from TrieRead (2)");
    unpacked_fnum = DefineFile(string(fname), HYPH_SUFFIX,
      &fpos(fname), HYPH_FILE, HYPH_PATH);
    unpacked_fp = OpenFile(unpacked_fnum, FALSE, FALSE);
    if( unpacked_fp == NULL )
    { Error(36, 8, "cannot open hyphenation file %s",
	WARN, no_fpos, FileName(unpacked_fnum));
      *success = FALSE;
      return (TRIE) NULL;
    }

    /* check that first line contains magic header or stub */
    if( StringFGets(str, MAX_BUFF, unpacked_fp) == NULL ||
	( !StringEqual(str, AsciiToFull("Lout hyphenation information\n")) &&
	  !StringEqual(str, AsciiToFull("Lout hyphenation placeholder\n")) )
      )
      Error(36, 9, "header line of hyphenation file %s missing",
	FATAL, no_fpos, FileName(unpacked_fnum));

    /* if file is just a placeholder, exit silently with success */
    if( !StringEqual(str, AsciiToFull("Lout hyphenation information\n")) )
    { fclose(unpacked_fp);
      *success = TRUE;
      return (TRIE) NULL;
    }

    /* read the classes, exceptions, and patterns from the unpacked file */
    T = NewTrie( (unsigned) 120000, (unsigned) 32767);
    state = START_STATE;
    hline_num = 1;
    length_limit = 0;
    while( StringFGets(buff, MAX_BUFF, unpacked_fp) != NULL )
    {
      /* process the line one whitespace-separated token at a time; */
      /* a token starting with % ends the line (comment)            */
      hline_num++;  bpos = 0;
      while( sscanf( (char *) &buff[bpos], "%s%n", str, &bcount) == 1 &&
	str[0] != '%' )
      {
	bpos += bcount;
	DecodeEscapes(str, string(fname), hline_num);

	switch( state )
	{
	  case START_STATE:

	    if( !StringEqual(str, AsciiToFull("Classes:")) )
	      Error(36, 10, "Classes heading of hyphenation file %s missing",
		FATAL, no_fpos, FileName(unpacked_fnum));
	    state = CLASSES_STATE;
	    break;


	  case CLASSES_STATE:

	    if( StringEqual(str, AsciiToFull("Exceptions:")) )
	    { state = EXCEPTIONS_STATE;
	    }
	    else if( StringEqual(str, AsciiToFull("Patterns:")) )
	    { state = PATTERNS_STATE;
	    }
	    else if( StringEqual(str, AsciiToFull("LengthLimit:")) )
	    { state = LENGTH_LIMIT_STATE;
	    }
	    else
	    { debug1(DHY, DD, "adding class %s", str);
	      AddClassToTrie(str, T);
	    }
	    break;


	  case EXCEPTIONS_STATE:

	    if( StringEqual(str, AsciiToFull("Patterns:")) )
	    { state = PATTERNS_STATE;
	    }
	    else if( StringEqual(str, AsciiToFull("LengthLimit:")) )
	    { state = LENGTH_LIMIT_STATE;
	    }
	    else
	    {
	      /* key is the word wrapped in '.' characters; each position */
	      /* is rated CH_NINE where the word has a hyphen and         */
	      /* CH_EIGHT everywhere else                                 */
	      prev = CH_EIGHT;  j = 0;
	      key[j] = '.', value[j++] = prev, prev = CH_EIGHT;
	      for( i = 0;  str[i] != '\0';  i++ )
	      { if( str[i] == CH_HYPHEN )  prev = CH_NINE;
		else key[j] = str[i], value[j++] = prev, prev = CH_EIGHT;
	      }
	      key[j] = '.', value[j++] = prev, prev = CH_EIGHT;
	      key[j] = '\0';  value[j] = prev;  value[j+1] = '\0';
	      if( !TrieInsert(key, value, T, string(fname), hline_num) )
	      {
		Error(36, 11, "hyphenation file %s%s is too large (at line %d)",
		  WARN, &fpos(fname), string(fname), HYPH_SUFFIX, hline_num);
		fclose(unpacked_fp);
		free(T);
		*success = FALSE;
		return (TRIE) NULL;
	      }
	    }
	    break;


	  case LENGTH_LIMIT_STATE:

	    if( StringEqual(str, AsciiToFull("Patterns:")) )
	    { state = PATTERNS_STATE;
	    }
	    else if( sscanf( (char *) str, "%d", &length_limit) != 1 )
	    {
	      Error(36, 20, "bad LengthLimit in hyphenation file %s%s (line %d)",
		WARN, &fpos(fname), string(fname), HYPH_SUFFIX, hline_num);
	      fclose(unpacked_fp);
	      free(T);
	      *success = FALSE;
	      return (TRIE) NULL;
	    }
	    break;


	  case PATTERNS_STATE:

	    /* digits in the pattern rate the position before the next */
	    /* letter; positions without a digit are rated CH_ZERO     */
	    prev = CH_ZERO;  j = 0;
	    for( i = 0;  str[i] != '\0';  i++ )
	    { if( decimaldigit(str[i]) )  prev = str[i];
	      else key[j] = str[i], value[j++] = prev, prev = CH_ZERO;
	    }
	    key[j] = '\0';  value[j] = prev;  value[j+1] = '\0';
	    /* patterns longer than a non-zero length limit are skipped */
	    if( length_limit == 0 || j <= length_limit )
	    {
	      debug3(DHY, DD, "TrieInsert(%s, %s, T) [%d]", key, value, ++icount);
	      if( !TrieInsert(key, value, T, string(fname), hline_num) )
	      {
	        Error(36, 12, "hyphenation file %s%s is too large (at line %d)",
		  WARN, &fpos(fname), string(fname), HYPH_SUFFIX, hline_num);
		fclose(unpacked_fp);
		free(T);
	        *success = FALSE;
	        return (TRIE) NULL;
	      }
	    }
	    break;


	  default:

	    assert(FALSE, "TrieRead: state");
	    break;

	} /* end switch */
      } /* end while */
    } /* end while */

    if( state != PATTERNS_STATE )
      Error(36, 13, "format error in hyphenation file %s",
	FATAL, no_fpos, FileName(unpacked_fnum));
    fclose(unpacked_fp);
    CompressTrie(T);

    /* Avoid writing and re-reading a cache file in benchmarking mode */
#if 0
    /* write the compressed trie out to the packed file */
    /* cannot use FileName(packed_fnum) because path won't be right */
    StringCopy(buff, FileName(unpacked_fnum));
    StringCopy(&buff[StringLength(buff) - StringLength(HYPH_SUFFIX)],
      HYPH_PACKED_SUFFIX);
    packed_fp = StringFOpen(buff, WRITE_BINARY);
    if( packed_fp == NULL )
      Error(36, 14, "cannot write to hyphenation file %s", FATAL,no_fpos,buff);
    BePutInt(packed_fp, T->magic);
    BePutInt(packed_fp, T->class_count);
    for( i = 0; i < MAX_CHAR; i++ )  BePutChar(packed_fp, T->class[i]);
    /* BePutInt(packed_fp, 0); */  /* placeholder for node_mem now omitted */
    BePutInt(packed_fp, T->node_lim);
    BePutInt(packed_fp, T->node_free);
    /* BePutInt(packed_fp, 0); */  /* placeholder for string_mem now omitted */
    BePutInt(packed_fp, T->string_lim);
    BePutInt(packed_fp, T->string_first);
    for( i=0; i < T->node_free; i++ )  BePutShort(packed_fp, T->node_mem[i]);
    for( i=0; i < T->string_lim; i++)  BePutChar(packed_fp, T->string_mem[i]);
    fclose(packed_fp);

    /* free T */
    ifdebug(DMA, DD, DebugRegisterUsage(MEM_HYPH_PATS, 1,
      sizeof(struct trie_rec) + 120000*sizeof(short)+32767*sizeof(char)));
    free(T);

    /* now try again to open packed_fnum, the file just written */
    packed_fp = OpenFile(packed_fnum, FALSE, FALSE);
    if( packed_fp == NULL )
      Error(36, 15, "cannot open hyphenation file %s",
	FATAL, no_fpos, FileName(packed_fnum));
#endif
  } /* end if( packed_fp == NULL ) */

#if 0
  /* now packed hyphenation file is open, read it in */
  fseek(packed_fp, 0L, SEEK_END);
  len = (unsigned) ftell(packed_fp);  rewind(packed_fp);
  ifdebug(DMA, DD, DebugRegisterUsage(MEM_HYPH_PATS, 1, len));
  /* the 2*sizeof(void*) is for the sizes of node_mem and string_mem */
  T = (TRIE) malloc(len + 2*sizeof(void*));
  if( T == (TRIE) NULL )
    Error(36, 16, "run out of memory while reading hyphenation table",
      FATAL, no_fpos);
  if( BeGetInt(packed_fp, &T->magic) != 0 )
    Error(36, 17, "error on read from packed hyphenation file %s",
      FATAL, no_fpos, FileName(packed_fnum));
  if( T->magic != TRIE_MAGIC )
    Error(36, 18, "bad magic number in hyphenation file %s",
      FATAL, no_fpos, FileName(packed_fnum));
  BeGetInt(packed_fp, &T->class_count);
  for( i = 0; i < MAX_CHAR; i++ )  BeGetChar(packed_fp, &T->class[i]);
  /* BeGetInt(packed_fp, &i); */  /* placeholder for node_mem now omitted */
  BeGetInt(packed_fp, &T->node_lim);
  BeGetInt(packed_fp, &T->node_free);
  /* BeGetInt(packed_fp, &i); */  /* placeholder for string_mem now omitted */
  BeGetInt(packed_fp, &T->string_lim);
  BeGetInt(packed_fp, &T->string_first);
  T->node_mem = (short *) ( (char *) T + sizeof(struct trie_rec) );
  T->string_mem = (FULL_CHAR *) &(T->node_mem[T->node_lim]);
  for( i = 0; i < T->node_free; i++ )  BeGetShort(packed_fp, &T->node_mem[i]);
  for( i = 0; i < T->string_lim; i++ ) BeGetChar(packed_fp, &T->string_mem[i]);
  fclose(packed_fp);
#endif

  /* debug and exit */
  debug0(DHY, DD, "TrieRead returning, T =");
  *success = TRUE;
  ifdebug(DHY, DD, TriePrint(T, stderr));
  return T;
} /* end TrieRead */
/*****************************************************************************/
/* */
/* AccumulateRating(x, y) */
/* */
/* Accumulate the hyphenation rating string x into y. */
/* */
/*****************************************************************************/
/* For every position up to the end of string x, raise y's character to */
/* x's character when x's is larger (a position-wise maximum, stored in */
/* y). The loop is driven by x only, so y must be at least as long as x. */
/* Statement macro: it declares locals p and q, so the arguments must not */
/* be expressions that mention those names. */
#define AccumulateRating(x, y) \
{ FULL_CHAR *p = x, *q = y; \
while( *p ) \
{ if( *p > *q ) *q = *p; \
p++, q++; \
} \
} /* end AccumulateRating */
/*@::ReadHyphTable()@*********************************************************/
/* */
/* BOOLEAN ReadHyphTable(lnum) */
/* */
/* Read hyphenation table for language lnum. */
/* */
/*****************************************************************************/
/*
 * ReadHyphTable(lnum)
 *
 * Load the hyphenation table for language lnum via TrieRead, record it
 * in HyphTables[], and mark the language as tried.  Must be called at
 * most once per language, and only for lnum > 0.  Returns the success
 * flag reported by TrieRead.
 */
BOOLEAN ReadHyphTable(LANGUAGE_NUM lnum)
{
  BOOLEAN ok;
  TRIE loaded;

  debug1(DHY, DD, "ReadHyphTable(%d)", lnum);
  assert(lnum > 0, "ReadHyphTable: lnum <= 0!");
  assert(HyphTables[lnum]==(TRIE) NULL && !TriedFile[lnum], "ReadHyphTable!");

  loaded = TrieRead(lnum, &ok);
  HyphTables[lnum] = loaded;
  TriedFile[lnum] = TRUE;

  debug2(DHY, DD, "ReadHyphTable(%d) returning %s", lnum, bool(ok));
  return ok;
} /* end ReadHyphTable */
/*@::Hyphenate@***************************************************************/
/*                                                                           */
/*  OBJECT Hyphenate(x)                                                      */
/*                                                                           */
/*  Hyphenate ACAT object x, returning the hyphenated result (x itself,      */
/*  modified in place).                                                      */
/*                                                                           */
/*  Each child y of x that is a non-empty word with word_hyph(y) set is      */
/*  examined, using the trie of its language (read from file on first        */
/*  use).  Words whose language has no trie are left alone.                  */
/*                                                                           */
/*  If the first run of letters in y is ended by a hyphen, y is split        */
/*  just after that hyphen and nowhere else.  Otherwise the patterns         */
/*  stored in the trie are matched against the run, and y is split at        */
/*  every position whose accumulated rating is odd, except positions         */
/*  leaving fewer than two letters of the run on either side or lying        */
/*  inside a ligature of y's font.  Runs of fewer than five letters          */
/*  are never split.                                                         */
/*                                                                           */
/*  Each split truncates y and links in a HYPH_MODE gap object plus a        */
/*  new word holding the tail of y.                                          */
/*                                                                           */
/*****************************************************************************/

OBJECT Hyphenate(OBJECT x)
{ OBJECT link, y, z, next_link;  TRIE T;  LANGUAGE_NUM lnum;
  FULL_CHAR str[MAX_WORD+2], rate[MAX_WORD+3], val[MAX_WORD+3],
    *class, *key, *ss, *s, *p, *rem, *lig, *a, *b;
  int start, stop, i, curr_node, next_node, pos;
  BOOLEAN hyphenated, success;
  assert( type(x) == ACAT, "Hyphenate: type(x) != ACAT!" );
  debug1(DHY, D, "Hyphenate(%s)", EchoObject(x));

  /* for each word y of x, try to hyphenate it */
  for( link = Down(x);  link != x;  link = NextDown(link) )
  { Child(y, link);

    /* skip children that are not non-empty words with hyphenation enabled, */
    /* but first clear the nobreak flag of any HYPH_MODE gap met on the way */
    if( !is_word(type(y)) || string(y)[0] == '\0' || !word_hyph(y) )
    {
      if( type(y) == GAP_OBJ && mode(gap(y)) == HYPH_MODE )
        nobreak(gap(y)) = FALSE;
      continue;
    }
    debug1(DHY, DD, "Hyphenate() examining %s", EchoObject(y));

    /* determine T, the trie to use */
    lnum = word_language(y);
    if( lnum == 0 )
      Error(36, 19, "no current language for word %s",
        FATAL, &fpos(y), string(y));
    T = HyphTables[lnum];

    /* if no trie is present, try to get it from a file; TriedFile[lnum] */
    /* ensures that the read is attempted at most once per language       */
    if( T == (TRIE) NULL )
    { if( !TriedFile[lnum] )
      { T = HyphTables[lnum] = TrieRead(lnum, &success);
        TriedFile[lnum] = TRUE;
      }
      if( T == (TRIE) NULL )
      { debug1(DHY, DD, "Hyphenate continuing (no trie for %s)", string(y));
        continue;
      }
    }

    /* start := index of first letter of y, stop := index following last */
    key = string(y);  class = T->class;
    for( start = 0;  class[key[start]] == PUNCT_CLASS;  start++ );
    for( stop = start;  class[key[stop]] > PUNCT_CLASS;  stop++ );

    /* if a - ended the run, hyphenate there only */
    if( key[stop] == CH_HYPHEN )
    {
      /* actually, don't hyphenate if the hyphen is last in the word [thanks Uwe] */
      if( key[stop+1] == '\0' )
        continue;

      /* make a new word z from whatever follows the hyphen, like y in style */
      next_link = NextDown(link);
      z = MakeWord(WORD, &key[stop+1], &fpos(y));
      word_font(z) = word_font(y);
      word_colour(z) = word_colour(y);
      word_outline(z) = word_outline(y);
      word_language(z) = word_language(y);
      word_hyph(z) = word_hyph(y);
      underline(z) = underline(y);
      debug1(DHY, DD, "Hyphenate (hyph case) making fragment %s", string(z));
      FontWordSize(z);
      Link(NextDown(link), z);

      /* make a zero-width HYPH_MODE gap, linked at the same place as z was */
      New(z, GAP_OBJ);
      hspace(z) = vspace(z) = 0;
      SetGap(gap(z), FALSE, FALSE, TRUE, FIXED_UNIT, HYPH_MODE, 0);
      underline(z) = underline(y);
      Link(NextDown(link), z);
      Link(z, MakeWord(WORD, STR_GAP_ZERO_HYPH, &fpos(y)));

      /* cut y off just after the hyphen and recompute its size */
      key[stop + 1] = '\0';
      FontWordSize(y);

      /* NOTE(review): next_link is not read again on this path; presumably */
      /* stepping one link forward (rather than to PrevDown(next_link) as   */
      /* formerly) lets the new fragment be examined in its turn - confirm  */
      /* *** link = PrevDown(next_link); */
      link = NextDown(link);
      continue;
    }

    /* do not hyphenate if less than 5 letters, or a kill char is nearby */
    if( stop - start < 5 )  continue;
    if( key[stop] != '\0' && class[key[stop]] == KILL_CLASS )  continue;

    /* let str[] be the converted substring, let rate[] be all CH_ZERO;   */
    /* str[] is the run of letter classes with PUNCT_CLASS at both ends,  */
    /* and rate[k] is the rating of the break just before str[k]          */
    str[0] = PUNCT_CLASS;  rate[0] = CH_ZERO;
    for( i = 0;  i < stop - start;  i++ )
    { str[i+1] = class[key[start + i]];
      rate[i+1] = CH_ZERO;
    }
    str[i+1] = PUNCT_CLASS;  rate[i+1] = CH_ZERO;
    str[i+2] = '\0';  rate[i+2] = CH_ZERO;
    rate[i+3] = '\0';
    ifdebug(DHY, DD, ShowRate(key, start, stop, rate, stderr));

    /* for each suffix of str[], accumulate patterns matching its prefixes */
    ss = str;
    do
    {
      ifdebug(DHY, DD,
        fprintf(stderr, "trying suffix \"");
        for( p = ss; *p != 0; p++ ) fprintf(stderr, "%c", findrep(*p, T));
        fprintf(stderr, "\"\n");
      );

      /* accumulate all prefixes of ss */
      curr_node = 0;  s = ss;
      for(;;)
      {
        /* if curr_node has empty string, that is one prefix */
        pos = T->node_mem[curr_node];
        if( pos < 0 )
        { AltUncompressValue(&(T->string_mem[- pos]), val);
          AccumulateRating(val, rate+(ss-str));
          debug1(DHY, DD, " found %s", val);
        }

        /* if ss is finished, no other prefixes are possible */
        if( *s == '\0' )  break;

        /* determine next_node and break if empty */
        next_node = T->node_mem[curr_node + *s];
        if( next_node == 0 )  break;

        /* if next_node is a string, check whether it is a prefix of ss; */
        /* the rating value is stored just after the string's '\0'       */
        if( next_node < 0 )
        { rem = &(T->string_mem[-next_node]);
          do
          { if( *rem == '\0' )
            { AltUncompressValue(rem+1, val);
              AccumulateRating(val, rate+(ss-str));
              debug1(DHY, DD, " found %s", val);
              break;
            }
          } while( *++s == *rem++ );
          break;
        }

        /* otherwise go on to the next trie node */
        curr_node = NODE_MULT*next_node;  s++;
      }
    } while( *(++ss + 1) != PUNCT_CLASS );
    ifdebug(DHY, DD, ShowRate(key, start, stop, rate, stderr));

    /* set rate[i] to CH_ZERO whenever key[start+i-1] lies within a ligature. */
    /* rate[] is indexed relative to start, so the scan begins at key[start]: */
    /* a ligature beginning there covers key[start+1], the character that     */
    /* follows break 2.  (Beginning at key[0] misplaced every entry whenever  */
    /* y had leading punctuation, i.e. start > 0.)                            */
    /* NOTE(review): the loop assumes lig[c] > 1 locates, at offset           */
    /* lig[c] + MAX_CHARS, a list of '\0'-terminated ligature entries for c,  */
    /* each ending in its replacement character - confirm against the font    */
    /* module                                                                 */
    lig = finfo[word_font(y)].lig_table;
    for( p = key + start, i = 2;  *p != '\0';  p++, i++ )
    { if( lig[*p] > 1 )
      { a = &lig[ lig[*p] + MAX_CHARS ];
        while( *a++ == *p )
        { b = p+1;
          while( *a == *b && *(a+1) != '\0' && *b != '\0' )  a++, b++;
          if( *(a+1) == '\0' )
          { rate[i] = CH_ZERO;
            break;
          }
          else
          { while( *++a );
            a++;
          }
        }
      }
    }
    ifdebug(DHY, DD, ShowRate(key, start, stop, rate, stderr));

    /* now rate[] has accumulated ratings; use it to perform hyphenations. */
    /* i runs from right to left, so each new fragment is made from the    */
    /* current tail of key before key is cut at that point; the bounds     */
    /* keep at least two letters of the run on each side of every break    */
    hyphenated = FALSE;
    next_link = NextDown(link);
    for( i = stop - start - 1;  i >= 3;  i-- )
    {
      /* hyphenate at i if rate[i] is odd */
      if( is_odd(rate[i]) )
      { z = MakeWord(WORD, &key[start+i-1], &fpos(y));
        word_font(z) = word_font(y);
        word_colour(z) = word_colour(y);
        word_outline(z) = word_outline(y);
        word_language(z) = word_language(y);
        word_hyph(z) = word_hyph(y);
        underline(z) = underline(y);
        debug1(DHY, D, "Hyphenate making fragment %s", string(z));
        FontWordSize(z);
        Link(NextDown(link), z);
        New(z, GAP_OBJ);
        hspace(z) = vspace(z) = 0;
        SetGap(gap(z), FALSE, FALSE, TRUE, FIXED_UNIT, HYPH_MODE, 0);
        underline(z) = underline(y);
        Link(NextDown(link), z);
        Link(z, MakeWord(WORD, STR_GAP_ZERO_HYPH, &fpos(y)));
        key[start + i - 1] = '\0';
        hyphenated = TRUE;
      }
    }

    /* y was shortened, so recompute its size; then resume the outer loop */
    /* at the link that followed y before any fragments were added        */
    if( hyphenated )
    { FontWordSize(y);
      link = PrevDown(next_link);
    }

  } /* end for each word */

  debug3(DHY, D, "Hyphenate returning %s,%s %s",
    EchoLength(back(x, COLM)), EchoLength(fwd(x, COLM)), EchoObject(x));
  return x;
} /* end Hyphenate */