MultiSource/Benchmarks/nbench/nbench1.c - llvm-test-suite - Git at Google


 /*
 ** nbench1.c
 */

 /********************************
 **       BYTEmark (tm)         **
 ** BYTE NATIVE MODE BENCHMARKS **
 **       VERSION 2             **
 **                             **
 ** Included in this source     **
 ** file:                       **
 **  Numeric Heapsort           **
 **  String Heapsort            **
 **  Bitfield test              **
 **  Floating point emulation   **
 **  Fourier coefficients       **
 **  Assignment algorithm       **
 **  IDEA Encyption             **
 **  Huffman compression        **
 **  Back prop. neural net      **
 **  LU Decomposition           **
 **    (linear equations)       **
 ** ----------                  **
 ** Rick Grehan, BYTE Magazine  **
 *********************************
 **
 ** BYTEmark (tm)
 ** BYTE's Native Mode Benchmarks
 ** Rick Grehan, BYTE Magazine
 **
 ** Creation:
 ** Revision: 3/95;10/95
 **  10/95 - Removed allocation that was taking place inside
 **   the LU Decomposition benchmark. Though it didn't seem to
 **   make a difference on systems we ran it on, it nonetheless
 **   removes an operating system dependency that probably should
 **   not have been there.
 **
 ** DISCLAIMER
 ** The source, executable, and documentation files that comprise
 ** the BYTEmark benchmarks are made available on an "as is" basis.
 ** This means that we at BYTE Magazine have made every reasonable
 ** effort to verify that the there are no errors in the source and
 ** executable code.  We cannot, however, guarantee that the programs
 ** are error-free.  Consequently, McGraw-HIll and BYTE Magazine make
 ** no claims in regard to the fitness of the source code, executable
 ** code, and documentation of the BYTEmark.
 **  Furthermore, BYTE Magazine, McGraw-Hill, and all employees
 ** of McGraw-Hill cannot be held responsible for any damages resulting
 ** from the use of this code or the results obtained from using
 ** this code.
 */

 /*
 ** INCLUDES
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <strings.h>
 #include <math.h>
 #include "nmglobal.h"
 #include "nbench1.h"
 #include "wordcat.h"

 #ifdef DEBUG
 static int numsort_status=0;
 static int stringsort_status=0;
 #endif

 /*********************
 ** NUMERIC HEAPSORT **
 **********************
 ** This test implements a heapsort algorithm, performed on an
 ** array of longs.
 */

 /**************
 ** DoNumSort **
 ***************
 ** This routine performs the CPU numeric sort test.
 ** NOTE: Last version incorrectly stated that the routine
 **  returned result in # of longword sorted per second.
 **  Not so; the routine returns # of iterations per sec.
 */

 void DoNumSort(void)
 {
 SortStruct *numsortstruct;      /* Local pointer to global struct */
 farlong *arraybase;     /* Base pointers of array */
 long accumtime;         /* Accumulated time */
 double iterations;      /* Iteration counter */
 char *errorcontext;     /* Error context string pointer */
 int systemerror;        /* For holding error codes */

 /*
 ** Link to global structure
 */
 numsortstruct=&global_numsortstruct;

 /*
 ** Set the error context string.
 */
 errorcontext="CPU:Numeric Sort";

 /*
 ** See if we need to do self adjustment code.
 */
 if(numsortstruct->adjust==0)
 {
 	/*
 	** Self-adjustment code.  The system begins by sorting 1
 	** array.  If it does that in no time, then two arrays
 	** are built and sorted.  This process continues until
 	** enough arrays are built to handle the tolerance.
 	*/
 	numsortstruct->numarrays=1;
 	while(1)
 	{
 		/*
 		** Allocate space for arrays
 		*/
 		arraybase=(farlong *)AllocateMemory(sizeof(long) *
 			numsortstruct->numarrays * numsortstruct->arraysize,
 			&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			FreeMemory((farvoid *)arraybase,
 				  &systemerror);
 			ErrorExit();
 		}

 		/*
 		** Do an iteration of the numeric sort.  If the
 		** elapsed time is less than or equal to the permitted
 		** minimum, then allocate for more arrays and
 		** try again.
 		*/
 		if(DoNumSortIteration(arraybase,
 			numsortstruct->arraysize,
 			numsortstruct->numarrays)>global_min_ticks)
 			break;          /* We're ok...exit */

 		FreeMemory((farvoid *)arraybase,&systemerror);
 		if(numsortstruct->numarrays++>NUMNUMARRAYS)
 		{       printf("CPU:NSORT -- NUMNUMARRAYS hit.\n");
 			ErrorExit();
 		}
 	}
 }
 else
 {       /*
 	** Allocate space for arrays
 	*/
 	arraybase=(farlong *)AllocateMemory(sizeof(long) *
 		numsortstruct->numarrays * numsortstruct->arraysize,
 		&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		FreeMemory((farvoid *)arraybase,
 			  &systemerror);
 		ErrorExit();
 	}

 }
 /*
 ** All's well if we get here.  Repeatedly perform sorts until the
 ** accumulated elapsed time is greater than # of seconds requested.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	accumtime+=DoNumSortIteration(arraybase,
 		numsortstruct->arraysize,
 		numsortstruct->numarrays);
 	iterations+=(double)1.0;
 } while(iterations < BASE_ITERATIONS * 15);

 /*
 ** Clean up, calculate results, and go home.  Be sure to
 ** show that we don't have to rerun adjustment code.
 */
 FreeMemory((farvoid *)arraybase,&systemerror);

 numsortstruct->sortspersec=iterations *
 	(double)numsortstruct->numarrays / TicksToFracSecs(accumtime);

 if(numsortstruct->adjust==0)
 	numsortstruct->adjust=1;

 #ifdef DEBUG
 if (numsort_status==0) printf("Numeric sort: OK\n");
 numsort_status=0;
 #endif
 return;
 }

 /***********************
 ** DoNumSortIteration **
 ************************
 ** This routine executes one iteration of the numeric
 ** sort benchmark.  It returns the number of ticks
 ** elapsed for the iteration.
 */
 static ulong DoNumSortIteration(farlong *arraybase,
 		ulong arraysize,
 		uint numarrays)
 {
 ulong elapsed;          /* Elapsed ticks */
 ulong i;
 /*
 ** Load up the array with random numbers
 */
 LoadNumArrayWithRand(arraybase,arraysize,numarrays);

 /*
 ** Start the stopwatch
 */
 elapsed=StartStopwatch();

 /*
 ** Execute a heap of heapsorts
 */
 for(i=0;i<numarrays;i++)
 	NumHeapSort(arraybase+i*arraysize,0L,arraysize-1L);

 /*
 ** Get elapsed time
 */
 elapsed=StopStopwatch(elapsed);
 #ifdef DEBUG
 {
 	for(i=0;i<arraysize-1;i++)
 	{       /*
 		** Compare to check for proper
 		** sort.
 		*/
 		if(arraybase[i+1]<arraybase[i])
 		{       printf("Sort Error\n");
 			numsort_status=1;
                         break;
 		}
 	}
 }
 #endif

 return(elapsed);
 }

 /*************************
 ** LoadNumArrayWithRand **
 **************************
 ** Load up an array with random longs.
 */
 static void LoadNumArrayWithRand(farlong *array,     /* Pointer to arrays */
 		ulong arraysize,
 		uint numarrays)         /* # of elements in array */
 {
 long i;                 /* Used for index */
 farlong *darray;        /* Destination array pointer */
 /*
 ** Initialize the random number generator
 */
 /* randnum(13L); */
 randnum((int32)13);

 /*
 ** Load up first array with randoms
 */
 for(i=0L;i<arraysize;i++)
         /* array[i]=randnum(0L); */
 	array[i]=randnum((int32)0);

 /*
 ** Now, if there's more than one array to load, copy the
 ** first into each of the others.
 */
 darray=array;
 while(--numarrays)
 {       darray+=arraysize;
 	for(i=0L;i<arraysize;i++)
 		darray[i]=array[i];
 }

 return;
 }

 /****************
 ** NumHeapSort **
 *****************
 ** Pass this routine a pointer to an array of long
 ** integers.  Also pass in minimum and maximum offsets.
 ** This routine performs a heap sort on that array.
 */
 static void NumHeapSort(farlong *array,
 	ulong bottom,           /* Lower bound */
 	ulong top)              /* Upper bound */
 {
 ulong temp;                     /* Used to exchange elements */
 ulong i;                        /* Loop index */

 /*
 ** First, build a heap in the array
 */
 for(i=(top/2L); i>0; --i)
 	NumSift(array,i,top);

 /*
 ** Repeatedly extract maximum from heap and place it at the
 ** end of the array.  When we get done, we'll have a sorted
 ** array.
 */
 for(i=top; i>0; --i)
 {       NumSift(array,bottom,i);
 	temp=*array;                    /* Perform exchange */
 	*array=*(array+i);
 	*(array+i)=temp;
 }
 return;
 }

 /************
 ** NumSift **
 *************
 ** Peforms the sift operation on a numeric array,
 ** constructing a heap in the array.
 */
 static void NumSift(farlong *array,     /* Array of numbers */
 	ulong i,                /* Minimum of array */
 	ulong j)                /* Maximum of array */
 {
 unsigned long k;
 long temp;                              /* Used for exchange */

 while((i+i)<=j)
 {
 	k=i+i;
 	if(k<j)
 		if(array[k]<array[k+1L])
 			++k;
 	if(array[i]<array[k])
 	{
 		temp=array[k];
 		array[k]=array[i];
 		array[i]=temp;
 		i=k;
 	}
 	else
 		i=j+1;
 }
 return;
 }

 /********************
 ** STRING HEAPSORT **
 ********************/

 /*****************
 ** DoStringSort **
 ******************
 ** This routine performs the CPU string sort test.
 ** Arguments:
 **      requested_secs = # of seconds to execute test
 **      stringspersec = # of strings per second sorted (RETURNED)
 */
 void DoStringSort(void)
 {

 SortStruct *strsortstruct;      /* Local for sort structure */
 faruchar *arraybase;            /* Base pointer of char array */
 long accumtime;                 /* Accumulated time */
 double iterations;              /* # of iterations */
 char *errorcontext;             /* Error context string pointer */
 int systemerror;                /* For holding error code */

 /*
 ** Link to global structure
 */
 strsortstruct=&global_strsortstruct;

 /*
 ** Set the error context
 */
 errorcontext="CPU:String Sort";

 /*
 ** See if we have to perform self-adjustment code
 */
 if(strsortstruct->adjust==0)
 {
 	/*
 	** Initialize the number of arrays.
 	*/
 	strsortstruct->numarrays=1;
 	while(1)
 	{
 		/*
 		** Allocate space for array.  We'll add an extra 100
 		** bytes to protect memory as strings move around
 		** (this can happen during string adjustment)
 		*/
 		arraybase=(faruchar *)AllocateMemory((strsortstruct->arraysize+100L) *
 			(long)strsortstruct->numarrays,&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			ErrorExit();
 		}

 		/*
 		** Do an iteration of the string sort.  If the
 		** elapsed time is less than or equal to the permitted
 		** minimum, then de-allocate the array, reallocate a
 		** an additional array, and try again.
 		*/
 		if(DoStringSortIteration(arraybase,
 			strsortstruct->numarrays,
 			strsortstruct->arraysize)>global_min_ticks)
 			break;          /* We're ok...exit */

 		FreeMemory((farvoid *)arraybase,&systemerror);
 		strsortstruct->numarrays+=1;
 	}
 }
 else
 {
 	/*
 	** We don't have to perform self adjustment code.
 	** Simply allocate the space for the array.
 	*/
 	arraybase=(faruchar *)AllocateMemory((strsortstruct->arraysize+100L) *
 		(long)strsortstruct->numarrays,&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		ErrorExit();
 	}
 }
 /*
 ** All's well if we get here.  Repeatedly perform sorts until the
 ** accumulated elapsed time is greater than # of seconds requested.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	accumtime+=DoStringSortIteration(arraybase,
 				strsortstruct->numarrays,
 				strsortstruct->arraysize);
 	iterations+=(double)strsortstruct->numarrays;
 } while(iterations < BASE_ITERATIONS * 5);

 /*
 ** Clean up, calculate results, and go home.
 ** Set flag to show we don't need to rerun adjustment code.
 */
 FreeMemory((farvoid *)arraybase,&systemerror);
 strsortstruct->sortspersec=iterations / (double)TicksToFracSecs(accumtime);
 if(strsortstruct->adjust==0)
 	strsortstruct->adjust=1;
 #ifdef DEBUG
 if (stringsort_status==0) printf("String sort: OK\n");
 stringsort_status=0;
 #endif
 return;
 }

 /**************************
 ** DoStringSortIteration **
 ***************************
 ** This routine executes one iteration of the string
 ** sort benchmark.  It returns the number of ticks
 ** Note that this routine also builds the offset pointer
 ** array.
 */
 static ulong DoStringSortIteration(faruchar *arraybase,
 		uint numarrays,ulong arraysize)
 {
 farulong *optrarray;            /* Offset pointer array */
 unsigned long elapsed;          /* Elapsed ticks */
 unsigned long nstrings;         /* # of strings in array */
 int syserror;                   /* System error code */
 unsigned int i;                 /* Index */
 farulong *tempobase;            /* Temporary offset pointer base */
 faruchar *tempsbase;            /* Temporary string base pointer */

 /*
 ** Load up the array(s) with random numbers
 */
 optrarray=LoadStringArray(arraybase,numarrays,&nstrings,arraysize);

 /*
 ** Set temp base pointers...they will be modified as the
 ** benchmark proceeds.
 */
 tempobase=optrarray;
 tempsbase=arraybase;

 /*
 ** Start the stopwatch
 */
 elapsed=StartStopwatch();

 /*
 ** Execute heapsorts
 */
 for(i=0;i<numarrays;i++)
 {       StrHeapSort(tempobase,tempsbase,nstrings,0L,nstrings-1);
 	tempobase+=nstrings;    /* Advance base pointers */
 	tempsbase+=arraysize+100;
 }

 /*
 ** Record elapsed time
 */
 elapsed=StopStopwatch(elapsed);

 #ifdef DEBUG
 {
 	unsigned long i;
 	for(i=0;i<nstrings-1;i++)
 	{       /*
 		** Compare strings to check for proper
 		** sort.
 		*/
 		if(str_is_less(optrarray,arraybase,nstrings,i+1,i))
 		{       printf("Sort Error\n");
 			stringsort_status=1;
                         break;
 		}
 	}
 }
 #endif

 /*
 ** Release the offset pointer array built by
 ** LoadStringArray()
 */
 FreeMemory((farvoid *)optrarray,&syserror);

 /*
 ** Return elapsed ticks.
 */
 return(elapsed);
 }

 /********************
 ** LoadStringArray **
 *********************
 ** Initialize the string array with random strings of
 ** varying sizes.
 ** Returns the pointer to the offset pointer array.
 ** Note that since we're creating a number of arrays, this
 ** routine builds one array, then copies it into the others.
 */
 static farulong *LoadStringArray(faruchar *strarray, /* String array */
 	uint numarrays,                 /* # of arrays */
 	ulong *nstrings,                /* # of strings */
 	ulong arraysize)                /* Size of array */
 {
 faruchar *tempsbase;            /* Temporary string base pointer */
 farulong *optrarray;            /* Local for pointer */
 farulong *tempobase;            /* Temporary offset pointer base pointer */
 unsigned long curroffset;       /* Current offset */
 int fullflag;                   /* Indicates full array */
 unsigned char stringlength;     /* Length of string */
 unsigned char i;                /* Index */
 unsigned long j;                /* Another index */
 unsigned int k;                 /* Yet another index */
 unsigned int l;                 /* Ans still one more index */
 int systemerror;                /* For holding error code */

 /*
 ** Initialize random number generator.
 */
 /* randnum(13L); */
 randnum((int32)13);

 /*
 ** Start with no strings.  Initialize our current offset pointer
 ** to 0.
 */
 *nstrings=0L;
 curroffset=0L;
 fullflag=0;

 do
 {
 	/*
 	** Allocate a string with a random length no
 	** shorter than 4 bytes and no longer than
 	** 80 bytes.  Note we have to also make sure
 	** there's room in the array.
 	*/
         /* stringlength=(unsigned char)((1+abs_randwc(76L)) & 0xFFL);*/
 	stringlength=(unsigned char)((1+abs_randwc((int32)76)) & 0xFFL);
 	if((unsigned long)stringlength+curroffset+1L>=arraysize)
 	{       stringlength=(unsigned char)((arraysize-curroffset-1L) &
 				0xFF);
 		fullflag=1;     /* Indicates a full */
 	}

 	/*
 	** Store length at curroffset and advance current offset.
 	*/
 	*(strarray+curroffset)=stringlength;
 	curroffset++;

 	/*
 	** Fill up the rest of the string with random bytes.
 	*/
 	for(i=0;i<stringlength;i++)
 	{       *(strarray+curroffset)=
 		        /* (unsigned char)(abs_randwc((long)0xFE)); */
 			(unsigned char)(abs_randwc((int32)0xFE));
 		curroffset++;
 	}

 	/*
 	** Increment the # of strings counter.
 	*/
 	*nstrings+=1L;

 } while(fullflag==0);

 /*
 ** We now have initialized a single full array.  If there
 ** is more than one array, copy the original into the
 ** others.
 */
 k=1;
 tempsbase=strarray;
 while(k<numarrays)
 {       tempsbase+=arraysize+100;         /* Set base */
 	for(l=0;l<arraysize;l++)
 		tempsbase[l]=strarray[l];
 	k++;
 }

 /*
 ** Now the array is full, allocate enough space for an
 ** offset pointer array.
 */
 optrarray=(farulong *)AllocateMemory(*nstrings * sizeof(unsigned long) *
 		numarrays,
 		&systemerror);
 if(systemerror)
 {       ReportError("CPU:Stringsort",systemerror);
 	FreeMemory((void *)strarray,&systemerror);
 	ErrorExit();
 }

 /*
 ** Go through the newly-built string array, building
 ** offsets and putting them into the offset pointer
 ** array.
 */
 curroffset=0;
 for(j=0;j<*nstrings;j++)
 {       *(optrarray+j)=curroffset;
 	curroffset+=(unsigned long)(*(strarray+curroffset))+1L;
 }

 /*
 ** As above, we've made one copy of the offset pointers,
 ** so duplicate this array in the remaining ones.
 */
 k=1;
 tempobase=optrarray;
 while(k<numarrays)
 {       tempobase+=*nstrings;
 	for(l=0;l<*nstrings;l++)
 		tempobase[l]=optrarray[l];
 	k++;
 }

 /*
 ** All done...go home.  Pass local pointer back.
 */
 return(optrarray);
 }

 /**************
 ** stradjust **
 ***************
 ** Used by the string heap sort.  Call this routine to adjust the
 ** string at offset i to length l.  The members of the string array
 ** are moved accordingly and the length of the string at offset i
 ** is set to l.
 */
 static void stradjust(farulong *optrarray,      /* Offset pointer array */
 	faruchar *strarray,                     /* String array */
 	ulong nstrings,                         /* # of strings */
 	ulong i,                                /* Offset to adjust */
 	uchar l)                                /* New length */
 {
 unsigned long nbytes;           /* # of bytes to move */
 unsigned long j;                /* Index */
 int direction;                  /* Direction indicator */
 unsigned char adjamount;        /* Adjustment amount */

 /*
 ** If new length is less than old length, the direction is
 ** down.  If new length is greater than old length, the
 ** direction is up.
 */
 direction=(int)l - (int)*(strarray+*(optrarray+i));
 adjamount=(unsigned char)abs(direction);

 /*
 ** See if the adjustment is being made to the last
 ** string in the string array.  If so, we don't have to
 ** do anything more than adjust the length field.
 */
 if(i==(nstrings-1L))
 {       *(strarray+*(optrarray+i))=l;
 	return;
 }

 /*
 ** Calculate the total # of bytes in string array from
 ** location i+1 to end of array.  Whether we're moving "up" or
 ** down, this is how many bytes we'll have to move.
 */
 nbytes=*(optrarray+nstrings-1L) +
 	(unsigned long)*(strarray+*(optrarray+nstrings-1L)) + 1L -
 	*(optrarray+i+1L);

 /*
 ** Calculate the source and the destination.  Source is
 ** string position i+1.  Destination is string position i+l
 ** (i+"ell"...don't confuse 1 and l).
 ** Hand this straight to memmove and let it handle the
 ** "overlap" problem.
 */
 MoveMemory((farvoid *)(strarray+*(optrarray+i)+l+1),
 	(farvoid *)(strarray+*(optrarray+i+1)),
 	(unsigned long)nbytes);

 /*
 ** We have to adjust the offset pointer array.
 ** This covers string i+1 to numstrings-1.
 */
 for(j=i+1;j<nstrings;j++)
 	if(direction<0)
 		*(optrarray+j)=*(optrarray+j)-adjamount;
 	else
 		*(optrarray+j)=*(optrarray+j)+adjamount;

 /*
 ** Store the new length and go home.
 */
 *(strarray+*(optrarray+i))=l;
 return;
 }

 /****************
 ** strheapsort **
 *****************
 ** Pass this routine a pointer to an array of unsigned char.
 ** The array is presumed to hold strings occupying at most
 ** 80 bytes (counts a byte count).
 ** This routine also needs a pointer to an array of offsets
 ** which represent string locations in the array, and
 ** an unsigned long indicating the number of strings
 ** in the array.
 */
 static void StrHeapSort(farulong *optrarray, /* Offset pointers */
 	faruchar *strarray,             /* Strings array */
 	ulong numstrings,               /* # of strings in array */
 	ulong bottom,                   /* Region to sort...bottom */
 	ulong top)                      /* Region to sort...top */
 {
 unsigned char temp[80];                 /* Used to exchange elements */
 unsigned char tlen;                     /* Temp to hold length */
 unsigned long i;                        /* Loop index */


 /*
 ** Build a heap in the array
 */
 for(i=(top/2L); i>0; --i)
 	strsift(optrarray,strarray,numstrings,i,top);

 /*
 ** Repeatedly extract maximum from heap and place it at the
 ** end of the array.  When we get done, we'll have a sorted
 ** array.
 */
 for(i=top; i>0; --i)
 {
 	strsift(optrarray,strarray,numstrings,0,i);

 	/* temp = string[0] */
 	tlen=*strarray;
 	MoveMemory((farvoid *)&temp[0], /* Perform exchange */
 		(farvoid *)strarray,
 		(unsigned long)(tlen+1));


 	/* string[0]=string[i] */
 	tlen=*(strarray+*(optrarray+i));
 	stradjust(optrarray,strarray,numstrings,0,tlen);
 	MoveMemory((farvoid *)strarray,
 		(farvoid *)(strarray+*(optrarray+i)),
 		(unsigned long)(tlen+1));

 	/* string[i]=temp */
 	tlen=temp[0];
 	stradjust(optrarray,strarray,numstrings,i,tlen);
 	MoveMemory((farvoid *)(strarray+*(optrarray+i)),
 		(farvoid *)&temp[0],
 		(unsigned long)(tlen+1));

 }
 return;
 }

 /****************
 ** str_is_less **
 *****************
 ** Pass this function:
 **      1) A pointer to an array of offset pointers
 **      2) A pointer to a string array
 **      3) The number of elements in the string array
 **      4) Offsets to two strings (a & b)
 ** This function returns TRUE if string a is < string b.
 */
 static int str_is_less(farulong *optrarray, /* Offset pointers */
 	faruchar *strarray,                     /* String array */
 	ulong numstrings,                       /* # of strings */
 	ulong a, ulong b)                       /* Offsets */
 {
 int slen;               /* String length */

 /*
 ** Determine which string has the minimum length.  Use that
 ** to call strncmp().  If they match up to that point, the
 ** string with the longer length wins.
 */
 slen=(int)*(strarray+*(optrarray+a));
 if(slen > (int)*(strarray+*(optrarray+b)))
 	slen=(int)*(strarray+*(optrarray+b));

 slen=strncmp((char *)(strarray+*(optrarray+a)),
 		(char *)(strarray+*(optrarray+b)),slen);

 if(slen==0)
 {
 	/*
 	** They match.  Return true if the length of a
 	** is greater than the length of b.
 	*/
 	if(*(strarray+*(optrarray+a)) >
 		*(strarray+*(optrarray+b)))
 		return(TRUE);
 	return(FALSE);
 }

 if(slen<0) return(TRUE);        /* a is strictly less than b */

 return(FALSE);                  /* Only other possibility */
 }

 /************
 ** strsift **
 *************
 ** Pass this function:
 **      1) A pointer to an array of offset pointers
 **      2) A pointer to a string array
 **      3) The number of elements in the string array
 **      4) Offset within which to sort.
 ** Sift the array within the bounds of those offsets (thus
 ** building a heap).
 */
 static void strsift(farulong *optrarray,        /* Offset pointers */
 	faruchar *strarray,                     /* String array */
 	ulong numstrings,                       /* # of strings */
 	ulong i, ulong j)                       /* Offsets */
 {
 unsigned long k;                /* Temporaries */
 unsigned char temp[80];
 unsigned char tlen;             /* For string lengths */


 while((i+i)<=j)
 {
 	k=i+i;
 	if(k<j)
 		if(str_is_less(optrarray,strarray,numstrings,k,k+1L))
 			++k;
 	if(str_is_less(optrarray,strarray,numstrings,i,k))
 	{
 		/* temp=string[k] */
 		tlen=*(strarray+*(optrarray+k));
 		MoveMemory((farvoid *)&temp[0],
 			(farvoid *)(strarray+*(optrarray+k)),
 			(unsigned long)(tlen+1));

 		/* string[k]=string[i] */
 		tlen=*(strarray+*(optrarray+i));
 		stradjust(optrarray,strarray,numstrings,k,tlen);
 		MoveMemory((farvoid *)(strarray+*(optrarray+k)),
 			(farvoid *)(strarray+*(optrarray+i)),
 			(unsigned long)(tlen+1));

 		/* string[i]=temp */
 		tlen=temp[0];
 		stradjust(optrarray,strarray,numstrings,i,tlen);
 		MoveMemory((farvoid *)(strarray+*(optrarray+i)),
 			(farvoid *)&temp[0],
 			(unsigned long)(tlen+1));
 		i=k;
 	}
 	else
 		i=j+1;
 }
 return;
 }

 /************************
 ** BITFIELD OPERATIONS **
 *************************/

 /*************
 ** DoBitops **
 **************
 ** Perform the bit operations test portion of the CPU
 ** benchmark.  Returns the iterations per second.
 */
 void DoBitops(void)
 {
 BitOpStruct *locbitopstruct;    /* Local bitop structure */
 farulong *bitarraybase;         /* Base of bitmap array */
 farulong *bitoparraybase;       /* Base of bitmap operations array */
 ulong nbitops;                  /* # of bitfield operations */
 ulong accumtime;                /* Accumulated time in ticks */
 double iterations;              /* # of iterations */
 char *errorcontext;             /* Error context string */
 int systemerror;                /* For holding error codes */
 int ticks;

 /*
 ** Link to global structure.
 */
 locbitopstruct=&global_bitopstruct;

 /*
 ** Set the error context.
 */
 errorcontext="CPU:Bitfields";

 /*
 ** See if we need to run adjustment code.
 */
 if(locbitopstruct->adjust==0)
 {
 	bitarraybase=(farulong *)AllocateMemory(locbitopstruct->bitfieldarraysize *
 		sizeof(ulong),&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		ErrorExit();
 	}

 	/*
 	** Initialize bitfield operations array to [2,30] elements
 	*/
 	locbitopstruct->bitoparraysize=30L;

 	while(1)
 	{
 		/*
 		** Allocate space for operations array
 		*/
 		bitoparraybase=(farulong *)AllocateMemory(locbitopstruct->bitoparraysize*2L*
 			sizeof(ulong),
 			&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			FreeMemory((farvoid *)bitarraybase,&systemerror);
 			ErrorExit();
 		}
 		/*
 		** Do an iteration of the bitmap test.  If the
 		** elapsed time is less than or equal to the permitted
 		** minimum, then de-allocate the array, reallocate a
 		** larger version, and try again.
 		*/
 		ticks=DoBitfieldIteration(bitarraybase,
 					   bitoparraybase,
 					   locbitopstruct->bitoparraysize,
 					   &nbitops);
 #ifdef DEBUG
 #ifdef LINUX
 	        if (locbitopstruct->bitoparraysize==30L){
 		  /* this is the first loop, write a debug file */
 		  FILE *file;
 		  unsigned long *running_base; /* same as farulong */
 		  long counter;
 		  file=fopen("debugbit.dat","w");
 		  running_base=bitarraybase;
 		  for (counter=0;counter<(long)(locbitopstruct->bitfieldarraysize);counter++){
 if (sizeof(long) == 8) {
 		    fprintf(file,"%08X",(unsigned int)(*running_base&0xFFFFFFFFL));
 		    fprintf(file,"%08X",(unsigned int)((*running_base>>32)&0xFFFFFFFFL));
 		    if ((counter+1)%4==0) fprintf(file,"\n");
 } else {
 		    fprintf(file,"%08lX",*running_base);
 		    if ((counter+1)%8==0) fprintf(file,"\n");
 }
 		    running_base=running_base+1;
 		  }
 		  fclose(file);
 		  printf("\nWrote the file debugbit.dat, you may want to compare it to debugbit.good\n");
 		}
 #endif
 #endif

 		if (ticks>global_min_ticks) break;      /* We're ok...exit */

 		FreeMemory((farvoid *)bitoparraybase,&systemerror);
 		locbitopstruct->bitoparraysize+=100L;
 	}
 }
 else
 {
 	/*
 	** Don't need to do self adjustment, just allocate
 	** the array space.
 	*/
 	bitarraybase=(farulong *)AllocateMemory(locbitopstruct->bitfieldarraysize *
 		sizeof(ulong),&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		ErrorExit();
 	}
 	bitoparraybase=(farulong *)AllocateMemory(locbitopstruct->bitoparraysize*2L*
 		sizeof(ulong),
 		&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		FreeMemory((farvoid *)bitarraybase,&systemerror);
 		ErrorExit();
 	}
 }

 /*
 ** All's well if we get here.  Repeatedly perform bitops until the
 ** accumulated elapsed time is greater than # of seconds requested.
 */
 accumtime=0L;
 iterations=(double)0.0;
 do {
 	accumtime+=DoBitfieldIteration(bitarraybase,
 			bitoparraybase,
 			locbitopstruct->bitoparraysize,&nbitops);
 	iterations+=(double)nbitops;
 } while(iterations < BASE_ITERATIONS * 50000);

 /*
 ** Clean up, calculate results, and go home.
 ** Also, set adjustment flag to show that we don't have
 ** to do self adjusting in the future.
 */
 FreeMemory((farvoid *)bitarraybase,&systemerror);
 FreeMemory((farvoid *)bitoparraybase,&systemerror);
 locbitopstruct->bitopspersec=iterations /TicksToFracSecs(accumtime);
 if(locbitopstruct->adjust==0)
 	locbitopstruct->adjust=1;

 return;
 }

 /************************
 ** DoBitfieldIteration **
 *************************
 ** Perform a single iteration of the bitfield benchmark.
 ** Return the # of ticks accumulated by the operation.
 */
 static ulong DoBitfieldIteration(farulong *bitarraybase,
 		farulong *bitoparraybase,
 		long bitoparraysize,
 		ulong *nbitops)
 {
 long i;                         /* Index */
 ulong bitoffset;                /* Offset into bitmap */
 ulong elapsed;                  /* Time to execute */
 /*
 ** Clear # bitops counter
 */
 *nbitops=0L;

 /*
 ** Construct a set of bitmap offsets and run lengths.
 ** The offset can be any random number from 0 to the
 ** size of the bitmap (in bits).  The run length can
 ** be any random number from 1 to the number of bits
 ** between the offset and the end of the bitmap.
 ** Note that the bitmap has 8192 * 32 bits in it.
 ** (262,144 bits)
 */
 /*
 ** Reset random number generator so things repeat.
 ** Also reset the bit array we work on.
 ** added by Uwe F. Mayer
 */
 randnum((int32)13);
 for (i=0;i<global_bitopstruct.bitfieldarraysize;i++)
 {
 if (sizeof(long) == 8)
 	*(bitarraybase+i)=(ulong)0x5555555555555555;
 else
 	*(bitarraybase+i)=(ulong)0x55555555;
 }
 randnum((int32)13);
 /* end of addition of code */

 for (i=0;i<bitoparraysize;i++)
 {
 	/* First item is offset */
         /* *(bitoparraybase+i+i)=bitoffset=abs_randwc(262140L); */
 	*(bitoparraybase+i+i)=bitoffset=abs_randwc((int32)262140);

 	/* Next item is run length */
 	/* *nbitops+=*(bitoparraybase+i+i+1L)=abs_randwc(262140L-bitoffset);*/
 	*nbitops+=*(bitoparraybase+i+i+1L)=abs_randwc((int32)262140-bitoffset);
 }

 /*
 ** Array of offset and lengths built...do an iteration of
 ** the test.
 ** Start the stopwatch.
 */
 elapsed=StartStopwatch();

 /*
 ** Loop through array off offset/run length pairs.
 ** Execute operation based on modulus of index.
 */
 for(i=0;i<bitoparraysize;i++)
 {
 	switch(i % 3)
 	{

 		case 0: /* Set run of bits */
 			ToggleBitRun(bitarraybase,
 				*(bitoparraybase+i+i),
 				*(bitoparraybase+i+i+1),
 				1);
 			break;

 		case 1: /* Clear run of bits */
 			ToggleBitRun(bitarraybase,
 				*(bitoparraybase+i+i),
 				*(bitoparraybase+i+i+1),
 				0);
 			break;

 		case 2: /* Complement run of bits */
 			FlipBitRun(bitarraybase,
 				*(bitoparraybase+i+i),
 				*(bitoparraybase+i+i+1));
 			break;
 	}
 }

 /*
 ** Return elapsed time
 */
 return(StopStopwatch(elapsed));
 }


 /*****************************
 **     ToggleBitRun          *
 ******************************
 ** Set or clear a run of nbits starting at
 ** bit_addr in bitmap.
 */
 static void ToggleBitRun(farulong *bitmap, /* Bitmap */
 		ulong bit_addr,         /* Address of bits to set */
 		ulong nbits,            /* # of bits to set/clr */
 		uint val)               /* 1 or 0 */
 {
 unsigned long bindex;   /* Index into array */
 unsigned long bitnumb;  /* Bit number */

 while(nbits--)
 {
 if (sizeof(long) == 8) {
 	bindex=bit_addr>>6;     /* Index is number /64 */
 	bitnumb=bit_addr % 64;   /* Bit number in word */
 } else {
 	bindex=bit_addr>>5;     /* Index is number /32 */
 	bitnumb=bit_addr % 32;  /* bit number in word */
 }
 	if(val)
 		bitmap[bindex]|=(1L<<bitnumb);
 	else
 		bitmap[bindex]&=~(1L<<bitnumb);
 	bit_addr++;
 }
 return;
 }

 /***************
 ** FlipBitRun **
 ****************
 ** Complements a run of bits.
 */
 static void FlipBitRun(farulong *bitmap,        /* Bit map */
 		ulong bit_addr,                 /* Bit address */
 		ulong nbits)                    /* # of bits to flip */
 {
 unsigned long bindex;   /* Index into array */
 unsigned long bitnumb;  /* Bit number */

 while(nbits--)
 {
 if (sizeof(long) == 8) {
 	bindex=bit_addr>>6;     /* Index is number /64 */
 	bitnumb=bit_addr % 64;  /* Bit number in longword */
 } else {
 	bindex=bit_addr>>5;     /* Index is number /32 */
 	bitnumb=bit_addr % 32;  /* Bit number in longword */
 }
 	bitmap[bindex]^=(1L<<bitnumb);
 	bit_addr++;
 }

 return;
 }

 /*****************************
 ** FLOATING-POINT EMULATION **
 *****************************/

 /**************
 ** DoEmFloat **
 ***************
 ** Perform the floating-point emulation routines portion of the
 ** CPU benchmark.  Returns the operations per second.
 */
 void DoEmFloat(void)
 {
 EmFloatStruct *locemfloatstruct;        /* Local structure */
 InternalFPF *abase;             /* Base of A array */
 InternalFPF *bbase;             /* Base of B array */
 InternalFPF *cbase;             /* Base of C array */
 ulong accumtime;                /* Accumulated time in ticks */
 double iterations;              /* # of iterations */
 ulong tickcount;                /* # of ticks */
 char *errorcontext;             /* Error context string pointer */
 int systemerror;                /* For holding error code */
 ulong loops;                    /* # of loops */

 /*
 ** Link to global structure
 */
 locemfloatstruct=&global_emfloatstruct;

 /*
 ** Set the error context
 */
 errorcontext="CPU:Floating Emulation";


 /*
 ** Test the emulation routines.
 */
 #ifdef DEBUG
 #endif

 abase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF),
 		&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	ErrorExit();
 }

 bbase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF),
 		&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	FreeMemory((farvoid *)abase,&systemerror);
 	ErrorExit();
 }

 cbase=(InternalFPF *)AllocateMemory(locemfloatstruct->arraysize*sizeof(InternalFPF),
 		&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	FreeMemory((farvoid *)abase,&systemerror);
 	FreeMemory((farvoid *)bbase,&systemerror);
 	ErrorExit();
 }

 /*
 ** Set up the arrays
 */
 SetupCPUEmFloatArrays(abase,bbase,cbase,locemfloatstruct->arraysize);

 /*
 ** See if we need to do self-adjusting code.
 */
 if(locemfloatstruct->adjust==0)
 {
 	locemfloatstruct->loops=0;

 	/*
 	** Do an iteration of the tests.  If the elapsed time is
 	** less than minimum, increase the loop count and try
 	** again.
 	*/
 	for(loops=1;loops<CPUEMFLOATLOOPMAX;loops+=loops)
 	{       tickcount=DoEmFloatIteration(abase,bbase,cbase,
 			locemfloatstruct->arraysize,
 			loops);
 		if(tickcount>global_min_ticks)
 		{       locemfloatstruct->loops=loops;
 			break;
 		}
 	}
 }

 /*
 ** Verify that selft adjustment code worked.
 */
 if(locemfloatstruct->loops==0)
 {       printf("CPU:EMFPU -- CMPUEMFLOATLOOPMAX limit hit\n");
 	FreeMemory((farvoid *)abase,&systemerror);
 	FreeMemory((farvoid *)bbase,&systemerror);
 	FreeMemory((farvoid *)cbase,&systemerror);
 	ErrorExit();
 }

 /*
 ** All's well if we get here.  Repeatedly perform floating
 ** tests until the accumulated time is greater than the
 ** # of seconds requested.
 ** Each iteration performs arraysize * 3 operations.
 */
 accumtime=0L;
 iterations=(double)0.0;
 do {
 	accumtime+=DoEmFloatIteration(abase,bbase,cbase,
 			locemfloatstruct->arraysize,
 			locemfloatstruct->loops);
 	iterations+=(double)1.0;
 } while(iterations < BASE_ITERATIONS * 3);


 /*
 ** Clean up, calculate results, and go home.
 ** Also, indicate that adjustment is done.
 */
 FreeMemory((farvoid *)abase,&systemerror);
 FreeMemory((farvoid *)bbase,&systemerror);
 FreeMemory((farvoid *)cbase,&systemerror);

 locemfloatstruct->emflops=(iterations*(double)locemfloatstruct->loops)/
 		(double)TicksToFracSecs(accumtime);
 if(locemfloatstruct->adjust==0)
 	locemfloatstruct->adjust=1;

 #ifdef DEBUG
 printf("----------------------------------------------------------------------------\n");
 #endif
 return;
 }

 /*************************
 ** FOURIER COEFFICIENTS **
 *************************/

 /**************
 ** DoFourier **
 ***************
 ** Perform the transcendental/trigonometric portion of the
 ** benchmark.  This benchmark calculates the first n
 ** fourier coefficients of the function (x+1)^x defined
 ** on the interval 0,2.
 */
 void DoFourier(void)
 {
 FourierStruct *locfourierstruct;        /* Local fourier struct */
 fardouble *abase;               /* Base of A[] coefficients array */
 fardouble *bbase;               /* Base of B[] coefficients array */
 unsigned long accumtime;        /* Accumulated time in ticks */
 double iterations;              /* # of iterations */
 char *errorcontext;             /* Error context string pointer */
 int systemerror;                /* For error code */

 /*
 ** Link to global structure
 */
 locfourierstruct=&global_fourierstruct;

 /*
 ** Set error context string
 */
 errorcontext="FPU:Transcendental";

 /*
 ** See if we need to do self-adjustment code.
 */
 if(locfourierstruct->adjust==0)
 {
 	locfourierstruct->arraysize=100L;       /* Start at 100 elements */
 	while(1)
 	{

 		abase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double),
 				&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			ErrorExit();
 		}

 		bbase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double),
 				&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			FreeMemory((void *)abase,&systemerror);
 			ErrorExit();
 		}
 		/*
 		** Do an iteration of the tests.  If the elapsed time is
 		** less than or equal to the permitted minimum, re-allocate
 		** larger arrays and try again.
 		*/
 		if(DoFPUTransIteration(abase,bbase,
 			locfourierstruct->arraysize)>global_min_ticks)
 			break;          /* We're ok...exit */

 		/*
 		** Make bigger arrays and try again.
 		*/
 		FreeMemory((farvoid *)abase,&systemerror);
 		FreeMemory((farvoid *)bbase,&systemerror);
 		locfourierstruct->arraysize+=50L;
 	}
 }
 else
 {       /*
 	** Don't need self-adjustment.  Just allocate the
 	** arrays, and go.
 	*/
 	abase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double),
 			&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		ErrorExit();
 	}

 	bbase=(fardouble *)AllocateMemory(locfourierstruct->arraysize*sizeof(double),
 			&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		FreeMemory((void *)abase,&systemerror);
 		ErrorExit();
 	}
 }
 /*
 ** All's well if we get here.  Repeatedly perform integration
 ** tests until the accumulated time is greater than the
 ** # of seconds requested.
 */
 accumtime=0L;
 iterations=(double)0.0;
 do {
 	accumtime+=DoFPUTransIteration(abase,bbase,locfourierstruct->arraysize);
 	iterations+=(double)locfourierstruct->arraysize*(double)2.0-(double)1.0;
 } while(iterations < BASE_ITERATIONS * 400);


 /*
 ** Clean up, calculate results, and go home.
 ** Also set adjustment flag to indicate no adjust code needed.
 */
 FreeMemory((farvoid *)abase,&systemerror);
 FreeMemory((farvoid *)bbase,&systemerror);

 locfourierstruct->fflops=iterations/(double)TicksToFracSecs(accumtime);

 if(locfourierstruct->adjust==0)
 	locfourierstruct->adjust=1;

 return;
 }

 /************************
 ** DoFPUTransIteration **
 *************************
 ** Perform an iteration of the FPU Transcendental/trigonometric
 ** benchmark.  Here, an iteration consists of calculating the
 ** first n fourier coefficients of the function (x+1)^x on
 ** the interval 0,2.  n is given by arraysize.
 ** NOTE: The # of integration steps is fixed at
 ** 200.
 */
 static ulong DoFPUTransIteration(fardouble *abase,      /* A coeffs. */
 			fardouble *bbase,               /* B coeffs. */
 			ulong arraysize)                /* # of coeffs */
 {
 double omega;           /* Fundamental frequency */
 unsigned long i;        /* Index */
 unsigned long elapsed;  /* Elapsed time */

 /*
 ** Start the stopwatch
 */
 elapsed=StartStopwatch();

 /*
 ** Calculate the fourier series.  Begin by
 ** calculating A[0].
 */

 *abase=TrapezoidIntegrate((double)0.0,
 			(double)2.0,
 			200,
 			(double)0.0,    /* No omega * n needed */
 			0 )/(double)2.0;

 /*
 ** Calculate the fundamental frequency.
 ** ( 2 * pi ) / period...and since the period
 ** is 2, omega is simply pi.
 */
 omega=(double)3.1415926535897932;

 for(i=1;i<arraysize;i++)
 {

 	/*
 	** Calculate A[i] terms.  Note, once again, that we
 	** can ignore the 2/period term outside the integral
 	** since the period is 2 and the term cancels itself
 	** out.
 	*/
 	*(abase+i)=TrapezoidIntegrate((double)0.0,
 			(double)2.0,
 			200,
 			omega * (double)i,
 			1);

 	/*
 	** Calculate the B[i] terms.
 	*/
 	*(bbase+i)=TrapezoidIntegrate((double)0.0,
 			(double)2.0,
 			200,
 			omega * (double)i,
 			2);

 }
 #ifdef DEBUG
 {
   int i;
   printf("\nA[i]=\n");
   for (i=0;i<arraysize;i++) printf("%7.3g ",abase[i]);
   printf("\nB[i]=\n(undefined) ");
   for (i=1;i<arraysize;i++) printf("%7.3g ",bbase[i]);
 }
 #endif
 /*
 ** All done, stop the stopwatch
 */
 return(StopStopwatch(elapsed));
 }

 /***********************
 ** TrapezoidIntegrate **
 ************************
 ** Perform a simple trapezoid integration on the
 ** function (x+1)**x.
 ** x0,x1 set the lower and upper bounds of the
 ** integration.
 ** nsteps indicates # of trapezoidal sections
 ** omegan is the fundamental frequency times
 **  the series member #
 ** select = 0 for the A[0] term, 1 for cosine terms, and
 **   2 for sine terms.
 ** Returns the value.
 */
 static double TrapezoidIntegrate( double x0,            /* Lower bound */
 			double x1,              /* Upper bound */
 			int nsteps,             /* # of steps */
 			double omegan,          /* omega * n */
 			int select)
 {
 double x;               /* Independent variable */
 double dx;              /* Stepsize */
 double rvalue;          /* Return value */


 /*
 ** Initialize independent variable
 */
 x=x0;

 /*
 ** Calculate stepsize
 */
 dx=(x1 - x0) / (double)nsteps;

 /*
 ** Initialize the return value.
 */
 rvalue=thefunction(x0,omegan,select)/(double)2.0;

 /*
 ** Compute the other terms of the integral.
 */
 if(nsteps!=1)
 {       --nsteps;               /* Already done 1 step */
 	while(--nsteps )
 	{
 		x+=dx;
 		rvalue+=thefunction(x,omegan,select);
 	}
 }
 /*
 ** Finish computation
 */
 rvalue=(rvalue+thefunction(x1,omegan,select)/(double)2.0)*dx;

 return(rvalue);
 }

 /****************
 ** thefunction **
 *****************
 ** This routine selects the function to be used
 ** in the Trapezoid integration.
 ** x is the independent variable
 ** omegan is omega * n
 ** select chooses which of the sine/cosine functions
 **  are used.  note the special case for select=0.
 */
 static double thefunction(double x,             /* Independent variable */
 		double omegan,          /* Omega * term */
 		int select)             /* Choose term */
 {

 /*
 ** Use select to pick which function we call.
 */
 switch(select)
 {
 	case 0: return(pow(x+(double)1.0,x));

 	case 1: return(pow(x+(double)1.0,x) * cos(omegan * x));

 	case 2: return(pow(x+(double)1.0,x) * sin(omegan * x));
 }

 /*
 ** We should never reach this point, but the following
 ** keeps compilers from issuing a warning message.
 */
 return(0.0);
 }

 /*************************
 ** ASSIGNMENT ALGORITHM **
 *************************/

 /*************
 ** DoAssign **
 **************
 ** Perform an assignment algorithm.
 ** The algorithm was adapted from the step by step guide found
 ** in "Quantitative Decision Making for Business" (Gordon,
 **  Pressman, and Cohn; Prentice-Hall)
 **
 **
 ** NOTES:
 ** 1. Even though the algorithm distinguishes between
 **    ASSIGNROWS and ASSIGNCOLS, as though the two might
 **    be different, it does presume a square matrix.
 **    I.E., ASSIGNROWS and ASSIGNCOLS must be the same.
 **    This makes for some algorithmically-correct but
 **    probably non-optimal constructs.
 **
 */
 void DoAssign(void)
 {
 AssignStruct *locassignstruct;  /* Local structure ptr */
 farlong *arraybase;
 char *errorcontext;
 int systemerror;
 ulong accumtime;
 double iterations;

 /*
 ** Link to global structure
 */
 locassignstruct=&global_assignstruct;

 /*
 ** Set the error context string.
 */
 errorcontext="CPU:Assignment";

 /*
 ** See if we need to do self adjustment code.
 */
 if(locassignstruct->adjust==0)
 {
 	/*
 	** Self-adjustment code.  The system begins by working on 1
 	** array.  If it does that in no time, then two arrays
 	** are built.  This process continues until
 	** enough arrays are built to handle the tolerance.
 	*/
 	locassignstruct->numarrays=1;
 	while(1)
 	{
 		/*
 		** Allocate space for arrays
 		*/
 		arraybase=(farlong *) AllocateMemory(sizeof(long)*
 			ASSIGNROWS*ASSIGNCOLS*locassignstruct->numarrays,
 			 &systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			FreeMemory((farvoid *)arraybase,
 			  &systemerror);
 			ErrorExit();
 		}

 		/*
 		** Do an iteration of the assignment alg.  If the
 		** elapsed time is less than or equal to the permitted
 		** minimum, then allocate for more arrays and
 		** try again.
 		*/
 		if(DoAssignIteration(arraybase,
 			locassignstruct->numarrays)>global_min_ticks)
 			break;          /* We're ok...exit */

 		FreeMemory((farvoid *)arraybase, &systemerror);
 		locassignstruct->numarrays++;
 	}
 }
 else
 {       /*
 	** Allocate space for arrays
 	*/
 	arraybase=(farlong *)AllocateMemory(sizeof(long)*
 		ASSIGNROWS*ASSIGNCOLS*locassignstruct->numarrays,
 		 &systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		FreeMemory((farvoid *)arraybase,
 		  &systemerror);
 		ErrorExit();
 	}
 }

 /*
 ** All's well if we get here.  Do the tests.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	accumtime+=DoAssignIteration(arraybase,
 		locassignstruct->numarrays);
 	iterations+=(double)1.0;
 } while(iterations < BASE_ITERATIONS/3);

 /*
 ** Clean up, calculate results, and go home.  Be sure to
 ** show that we don't have to rerun adjustment code.
 */
 FreeMemory((farvoid *)arraybase,&systemerror);

 locassignstruct->iterspersec=iterations *
 	(double)locassignstruct->numarrays / TicksToFracSecs(accumtime);

 if(locassignstruct->adjust==0)
 	locassignstruct->adjust=1;

 return;

 }

 /**********************
 ** DoAssignIteration **
 ***********************
 ** This routine executes one iteration of the assignment test.
 ** It returns the number of ticks elapsed in the iteration.
 */
 static ulong DoAssignIteration(farlong *arraybase,
 	ulong numarrays)
 {
 longptr abase;                  /* local pointer */
 ulong elapsed;          /* Elapsed ticks */
 ulong i;

 /*
 ** Set up local pointer
 */
 abase.ptrs.p=arraybase;

 /*
 ** Load up the arrays with a random table.
 */
 LoadAssignArrayWithRand(arraybase,numarrays);

 /*
 ** Start the stopwatch
 */
 elapsed=StartStopwatch();

 /*
 ** Execute assignment algorithms
 */
 for(i=0;i<numarrays;i++)
 {       /* abase.ptrs.p+=i*ASSIGNROWS*ASSIGNCOLS; */
         /* Fixed  by Eike Dierks */
 	Assignment(*abase.ptrs.ap);
 	abase.ptrs.p+=ASSIGNROWS*ASSIGNCOLS;
 }

 /*
 ** Get elapsed time
 */
 return(StopStopwatch(elapsed));
 }

 /****************************
 ** LoadAssignArrayWithRand **
 *****************************
 ** Load the assignment arrays with random numbers.  All positive.
 ** These numbers represent costs.
 */
 static void LoadAssignArrayWithRand(farlong *arraybase,
 	ulong numarrays)
 {
 longptr abase,abase1;   /* Local for array pointer */
 ulong i;

 /*
 ** Set local array pointer
 */
 abase.ptrs.p=arraybase;
 abase1.ptrs.p=arraybase;

 /*
 ** Set up the first array.  Then just copy it into the
 ** others.
 */
 LoadAssign(*(abase.ptrs.ap));
 if(numarrays>1)
 	for(i=1;i<numarrays;i++)
 	  {     /* abase1.ptrs.p+=i*ASSIGNROWS*ASSIGNCOLS; */
 	        /* Fixed  by Eike Dierks */
 	        abase1.ptrs.p+=ASSIGNROWS*ASSIGNCOLS;
 		CopyToAssign(*(abase.ptrs.ap),*(abase1.ptrs.ap));
 	}

 return;
 }

 /***************
 ** LoadAssign **
 ****************
 ** The array given by arraybase is loaded with positive random
 ** numbers.  Elements in the array are capped at 5,000,000.
 */
 static void LoadAssign(farlong arraybase[][ASSIGNCOLS])
 {
 ushort i,j;

 /*
 ** Reset random number generator so things repeat.
 */
 /* randnum(13L); */
 randnum((int32)13);

 for(i=0;i<ASSIGNROWS;i++)
   for(j=0;j<ASSIGNROWS;j++){
     /* arraybase[i][j]=abs_randwc(5000000L);*/
     arraybase[i][j]=abs_randwc((int32)5000000);
   }

 return;
 }

 /*****************
 ** CopyToAssign **
 ******************
 ** Copy the contents of one array to another.  This is called by
 ** the routine that builds the initial array, and is used to copy
 ** the contents of the intial array into all following arrays.
 */
 static void CopyToAssign(farlong arrayfrom[ASSIGNROWS][ASSIGNCOLS],
 		farlong arrayto[ASSIGNROWS][ASSIGNCOLS])
 {
 ushort i,j;

 for(i=0;i<ASSIGNROWS;i++)
 	for(j=0;j<ASSIGNCOLS;j++)
 		arrayto[i][j]=arrayfrom[i][j];

 return;
 }

 /***************
 ** Assignment **
 ***************/
 static void Assignment(farlong arraybase[][ASSIGNCOLS])
 {
 short assignedtableau[ASSIGNROWS][ASSIGNCOLS];

 /*
 ** First, calculate minimum costs
 */
 calc_minimum_costs(arraybase);

 /*
 ** Repeat following until the number of rows selected
 ** equals the number of rows in the tableau.
 */
 while(first_assignments(arraybase,assignedtableau)!=ASSIGNROWS)
 {         second_assignments(arraybase,assignedtableau);
 }

 #ifdef DEBUG
 {
 	int i,j;
 	printf("\nColumn choices for each row\n");
 	for(i=0;i<ASSIGNROWS;i++)
 	{
 	        printf("R%03d: ",i);
 		for(j=0;j<ASSIGNCOLS;j++)
 			if(assignedtableau[i][j]==1)
 				printf("%03d ",j);
 	}
 }
 #endif

 return;
 }

 /***********************
 ** calc_minimum_costs **
 ************************
 ** Revise the tableau by calculating the minimum costs on a
 ** row and column basis.  These minima are subtracted from
 ** their rows and columns, creating a new tableau.
 */
 static void calc_minimum_costs(long tableau[][ASSIGNCOLS])
 {
 ushort i,j;              /* Index variables */
 long currentmin;        /* Current minimum */
 /*
 ** Determine minimum costs on row basis.  This is done by
 ** subtracting -- on a row-per-row basis -- the minum value
 ** for that row.
 */
 for(i=0;i<ASSIGNROWS;i++)
 {
 	currentmin=MAXPOSLONG;  /* Initialize minimum */
 	for(j=0;j<ASSIGNCOLS;j++)
 		if(tableau[i][j]<currentmin)
 			currentmin=tableau[i][j];

 	for(j=0;j<ASSIGNCOLS;j++)
 		tableau[i][j]-=currentmin;
 }

 /*
 ** Determine minimum cost on a column basis.  This works
 ** just as above, only now we step through the array
 ** column-wise
 */
 for(j=0;j<ASSIGNCOLS;j++)
 {
 	currentmin=MAXPOSLONG;  /* Initialize minimum */
 	for(i=0;i<ASSIGNROWS;i++)
 		if(tableau[i][j]<currentmin)
 			currentmin=tableau[i][j];

 	/*
 	** Here, we'll take the trouble to see if the current
 	** minimum is zero.  This is likely worth it, since the
 	** preceding loop will have created at least one zero in
 	** each row.  We can save ourselves a few iterations.
 	*/
 	if(currentmin!=0)
 		for(i=0;i<ASSIGNROWS;i++)
 			tableau[i][j]-=currentmin;
 }

 return;
 }

 /**********************
 ** first_assignments **
 ***********************
 ** Do first assignments.
 ** The assignedtableau[] array holds a set of values that
 ** indicate the assignment of a value, or its elimination.
 ** The values are:
 **      0 = Item is neither assigned nor eliminated.
 **      1 = Item is assigned
 **      2 = Item is eliminated
 ** Returns the number of selections made.  If this equals
 ** the number of rows, then an optimum has been determined.
 */
 static int first_assignments(long tableau[][ASSIGNCOLS],
 		short assignedtableau[][ASSIGNCOLS])
 {
 ushort i,j,k;                   /* Index variables */
 ushort numassigns;              /* # of assignments */
 ushort totnumassigns;           /* Total # of assignments */
 ushort numzeros;                /* # of zeros in row */
 int selected=0;                 /* Flag used to indicate selection */

 /*
 ** Clear the assignedtableau, setting all members to show that
 ** no one is yet assigned, eliminated, or anything.
 */
 for(i=0;i<ASSIGNROWS;i++)
 	for(j=0;j<ASSIGNCOLS;j++)
 		assignedtableau[i][j]=0;

 totnumassigns=0;
 do {
 	numassigns=0;
 	/*
 	** Step through rows.  For each one that is not currently
 	** assigned, see if the row has only one zero in it.  If so,
 	** mark that as an assigned row/col.  Eliminate other zeros
 	** in the same column.
 	*/
 	for(i=0;i<ASSIGNROWS;i++)
 	{       numzeros=0;
 		for(j=0;j<ASSIGNCOLS;j++)
 			if(tableau[i][j]==0L)
 				if(assignedtableau[i][j]==0)
 				{       numzeros++;
 					selected=j;
 				}
 		if(numzeros==1)
 		{       numassigns++;
 			totnumassigns++;
 			assignedtableau[i][selected]=1;
 			for(k=0;k<ASSIGNROWS;k++)
 				if((k!=i) &&
 				   (tableau[k][selected]==0))
 					assignedtableau[k][selected]=2;
 		}
 	}
 	/*
 	** Step through columns, doing same as above.  Now, be careful
 	** of items in the other rows of a selected column.
 	*/
 	for(j=0;j<ASSIGNCOLS;j++)
 	{       numzeros=0;
 		for(i=0;i<ASSIGNROWS;i++)
 			if(tableau[i][j]==0L)
 				if(assignedtableau[i][j]==0)
 				{       numzeros++;
 					selected=i;
 				}
 		if(numzeros==1)
 		{       numassigns++;
 			totnumassigns++;
 			assignedtableau[selected][j]=1;
 			for(k=0;k<ASSIGNCOLS;k++)
 				if((k!=j) &&
 				   (tableau[selected][k]==0))
 					assignedtableau[selected][k]=2;
 		}
 	}
 	/*
 	** Repeat until no more assignments to be made.
 	*/
 } while(numassigns!=0);

 /*
 ** See if we can leave at this point.
 */
 if(totnumassigns==ASSIGNROWS) return(totnumassigns);

 /*
 ** Now step through the array by row.  If you find any unassigned
 ** zeros, pick the first in the row.  Eliminate all zeros from
 ** that same row & column.  This occurs if there are multiple optima...
 ** possibly.
 */
 for(i=0;i<ASSIGNROWS;i++)
 {       selected=-1;
 	for(j=0;j<ASSIGNCOLS;j++)
 		if((tableau[i][j]==0L) &&
 		   (assignedtableau[i][j]==0))
 		{       selected=j;
 			break;
 		}
 	if(selected!=-1)
 	{       assignedtableau[i][selected]=1;
 		totnumassigns++;
 		for(k=0;k<ASSIGNCOLS;k++)
 			if((k!=selected) &&
 			   (tableau[i][k]==0L))
 				assignedtableau[i][k]=2;
 		for(k=0;k<ASSIGNROWS;k++)
 			if((k!=i) &&
 			   (tableau[k][selected]==0L))
 				assignedtableau[k][selected]=2;
 	}
 }

 return(totnumassigns);
 }

 /***********************
 ** second_assignments **
 ************************
 ** This section of the algorithm creates the revised
 ** tableau, and is difficult to explain.  I suggest you
 ** refer to the algorithm's source, mentioned in comments
 ** toward the beginning of the program.
 */
 static void second_assignments(long tableau[][ASSIGNCOLS],
 		short assignedtableau[][ASSIGNCOLS])
 {
 int i,j;                                /* Indexes */
 short linesrow[ASSIGNROWS];
 short linescol[ASSIGNCOLS];
 long smallest;                          /* Holds smallest value */
 ushort numassigns;                      /* Number of assignments */
 ushort newrows;                         /* New rows to be considered */
 /*
 ** Clear the linesrow and linescol arrays.
 */
 for(i=0;i<ASSIGNROWS;i++)
 	linesrow[i]=0;
 for(i=0;i<ASSIGNCOLS;i++)
 	linescol[i]=0;

 /*
 ** Scan rows, flag each row that has no assignment in it.
 */
 for(i=0;i<ASSIGNROWS;i++)
 {       numassigns=0;
 	for(j=0;j<ASSIGNCOLS;j++)
 		if(assignedtableau[i][j]==1)
 		{       numassigns++;
 			break;
 		}
 	if(numassigns==0) linesrow[i]=1;
 }

 do {

 	newrows=0;
 	/*
 	** For each row checked above, scan for any zeros.  If found,
 	** check the associated column.
 	*/
 	for(i=0;i<ASSIGNROWS;i++)
 	{       if(linesrow[i]==1)
 			for(j=0;j<ASSIGNCOLS;j++)
 				if(tableau[i][j]==0)
 					linescol[j]=1;
 	}

 	/*
 	** Now scan checked columns.  If any contain assigned zeros, check
 	** the associated row.
 	*/
 	for(j=0;j<ASSIGNCOLS;j++)
 		if(linescol[j]==1)
 			for(i=0;i<ASSIGNROWS;i++)
 				if((assignedtableau[i][j]==1) &&
 					(linesrow[i]!=1))
 				{
 					linesrow[i]=1;
 					newrows++;
 				}
 } while(newrows!=0);

 /*
 ** linesrow[n]==0 indicate rows covered by imaginary line
 ** linescol[n]==1 indicate cols covered by imaginary line
 ** For all cells not covered by imaginary lines, determine smallest
 ** value.
 */
 smallest=MAXPOSLONG;
 for(i=0;i<ASSIGNROWS;i++)
 	if(linesrow[i]!=0)
 		for(j=0;j<ASSIGNCOLS;j++)
 			if(linescol[j]!=1)
 				if(tableau[i][j]<smallest)
 					smallest=tableau[i][j];

 /*
 ** Subtract smallest from all cells in the above set.
 */
 for(i=0;i<ASSIGNROWS;i++)
 	if(linesrow[i]!=0)
 		for(j=0;j<ASSIGNCOLS;j++)
 			if(linescol[j]!=1)
 				tableau[i][j]-=smallest;

 /*
 ** Add smallest to all cells covered by two lines.
 */
 for(i=0;i<ASSIGNROWS;i++)
 	if(linesrow[i]==0)
 		for(j=0;j<ASSIGNCOLS;j++)
 			if(linescol[j]==1)
 				tableau[i][j]+=smallest;

 return;
 }

 /********************
 ** IDEA Encryption **
 *********************
 ** IDEA - International Data Encryption Algorithm.
 ** Based on code presented in Applied Cryptography by Bruce Schneier.
 ** Which was based on code developed by Xuejia Lai and James L. Massey.
 ** Other modifications made by Colin Plumb.
 **
 */

 /***********
 ** DoIDEA **
 ************
 ** Perform IDEA encryption.  Note that we time encryption & decryption
 ** time as being a single loop.
 */
 void DoIDEA(void)
 {
 IDEAStruct *locideastruct;      /* Loc pointer to global structure */
 int i;
 IDEAkey Z,DK;
 u16 userkey[8];
 ulong accumtime;
 double iterations;
 char *errorcontext;
 int systemerror;
 faruchar *plain1;               /* First plaintext buffer */
 faruchar *crypt1;               /* Encryption buffer */
 faruchar *plain2;               /* Second plaintext buffer */

 /*
 ** Link to global data
 */
 locideastruct=&global_ideastruct;

 /*
 ** Set error context
 */
 errorcontext="CPU:IDEA";

 /*
 ** Re-init random-number generator.
 */
 /* randnum(3L); */
 randnum((int32)3);

 /*
 ** Build an encryption/decryption key
 */
 for (i=0;i<8;i++)
         /* userkey[i]=(u16)(abs_randwc(60000L) & 0xFFFF); */
 	userkey[i]=(u16)(abs_randwc((int32)60000) & 0xFFFF);
 for(i=0;i<KEYLEN;i++)
 	Z[i]=0;

 /*
 ** Compute encryption/decryption subkeys
 */
 en_key_idea(userkey,Z);
 de_key_idea(Z,DK);

 /*
 ** Allocate memory for buffers.  We'll make 3, called plain1,
 ** crypt1, and plain2.  It works like this:
 **   plain1 >>encrypt>> crypt1 >>decrypt>> plain2.
 ** So, plain1 and plain2 should match.
 ** Also, fill up plain1 with sample text.
 */
 plain1=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror);
 if(systemerror)
 {
 	ReportError(errorcontext,systemerror);
 	ErrorExit();
 }

 crypt1=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror);
 if(systemerror)
 {
 	ReportError(errorcontext,systemerror);
 	FreeMemory((farvoid *)plain1,&systemerror);
 	ErrorExit();
 }

 plain2=(faruchar *)AllocateMemory(locideastruct->arraysize,&systemerror);
 if(systemerror)
 {
 	ReportError(errorcontext,systemerror);
 	FreeMemory((farvoid *)plain1,&systemerror);
 	FreeMemory((farvoid *)crypt1,&systemerror);
 	ErrorExit();
 }
 /*
 ** Note that we build the "plaintext" by simply loading
 ** the array up with random numbers.
 */
 for(i=0;i<locideastruct->arraysize;i++)
 	plain1[i]=(uchar)(abs_randwc(255) & 0xFF);

 /*
 ** See if we need to perform self adjustment loop.
 */
 if(locideastruct->adjust==0)
 {
 	/*
 	** Do self-adjustment.  This involves initializing the
 	** # of loops and increasing the loop count until we
 	** get a number of loops that we can use.
 	*/
 	for(locideastruct->loops=100L;
 	  locideastruct->loops<MAXIDEALOOPS;
 	  locideastruct->loops+=10L)
 		if(DoIDEAIteration(plain1,crypt1,plain2,
 		  locideastruct->arraysize,
 		  locideastruct->loops,
 		  Z,DK)>global_min_ticks) break;
 }

 /*
 ** All's well if we get here.  Do the test.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	accumtime+=DoIDEAIteration(plain1,crypt1,plain2,
 		locideastruct->arraysize,
 		locideastruct->loops,Z,DK);
 	iterations+=(double)locideastruct->loops;
 } while(iterations < BASE_ITERATIONS * 75);

 /*
 ** Clean up, calculate results, and go home.  Be sure to
 ** show that we don't have to rerun adjustment code.
 */
 FreeMemory((farvoid *)plain1,&systemerror);
 FreeMemory((farvoid *)crypt1,&systemerror);
 FreeMemory((farvoid *)plain2,&systemerror);
 locideastruct->iterspersec=iterations / TicksToFracSecs(accumtime);

 if(locideastruct->adjust==0)
 	locideastruct->adjust=1;

 return;

 }

 /********************
 ** DoIDEAIteration **
 *********************
 ** Execute a single iteration of the IDEA encryption algorithm.
 ** Actually, a single iteration is one encryption and one
 ** decryption.
 */
 static ulong DoIDEAIteration(faruchar *plain1,
 			faruchar *crypt1,
 			faruchar *plain2,
 			ulong arraysize,
 			ulong nloops,
 			IDEAkey Z,
 			IDEAkey DK)
 {
 register ulong i;
 register ulong j;
 ulong elapsed;
 #ifdef DEBUG
 int status=0;
 #endif

 /*
 ** Start the stopwatch.
 */
 elapsed=StartStopwatch();

 /*
 ** Do everything for nloops.
 */
 for(i=0;i<nloops;i++)
 {
 	for(j=0;j<arraysize;j+=(sizeof(u16)*4))
 		cipher_idea((u16 *)(plain1+j),(u16 *)(crypt1+j),Z);       /* Encrypt */

 	for(j=0;j<arraysize;j+=(sizeof(u16)*4))
 		cipher_idea((u16 *)(crypt1+j),(u16 *)(plain2+j),DK);      /* Decrypt */
 }

 #ifdef DEBUG
 for(j=0;j<arraysize;j++)
 	if(*(plain1+j)!=*(plain2+j)){
 		printf("IDEA Error! \n");
                 status=1;
                 }
 if (status==0) printf("IDEA: OK\n");
 #endif

 /*
 ** Get elapsed time.
 */
 return(StopStopwatch(elapsed));
 }

 /********
 ** mul **
 *********
 ** Performs multiplication, modulo (2**16)+1.  This code is structured
 ** on the assumption that untaken branches are cheaper than taken
 ** branches, and that the compiler doesn't schedule branches.
 */
 static u16 mul(register u16 a, register u16 b)
 {
 register u32 p;
 if(a)
 {       if(b)
 	{       p=(u32)(a*b);
 		b=low16(p);
 		a=(u16)(p>>16);
 		return(b-a+(b<a));
 	}
 	else
 		return(1-a);
 }
 else
 	return(1-b);
 }

 /********
 ** inv **
 *********
 ** Compute multiplicative inverse of x, modulo (2**16)+1
 ** using Euclid's GCD algorithm.  It is unrolled twice
 ** to avoid swapping the meaning of the registers.  And
 ** some subtracts are changed to adds.
 */
 static u16 inv(u16 x)
 {
 u16 t0, t1;
 u16 q, y;

 if(x<=1)
 	return(x);      /* 0 and 1 are self-inverse */
 t1=0x10001 / x;
 y=0x10001 % x;
 if(y==1)
 	return(low16(1-t1));
 t0=1;
 do {
 	q=x/y;
 	x=x%y;
 	t0+=q*t1;
 	if(x==1) return(t0);
 	q=y/x;
 	y=y%x;
 	t1+=q*t0;
 } while(y!=1);
 return(low16(1-t1));
 }

 /****************
 ** en_key_idea **
 *****************
 ** Compute IDEA encryption subkeys Z
 */
 static void en_key_idea(u16 *userkey, u16 *Z)
 {
 int i,j;

 /*
 ** shifts
 */
 for(j=0;j<8;j++)
 	Z[j]=*userkey++;
 for(i=0;j<KEYLEN;j++)
 {       i++;
 	Z[i+7]=(Z[i&7]<<9)| (Z[(i+1) & 7] >> 7);
 	Z+=i&8;
 	i&=7;
 }
 return;
 }

 /****************
 ** de_key_idea **
 *****************
 ** Compute IDEA decryption subkeys DK from encryption
 ** subkeys Z.
 */
 static void de_key_idea(IDEAkey Z, IDEAkey DK)
 {
 IDEAkey TT;
 int j;
 u16 t1, t2, t3;
 u16 *p;
 p=(u16 *)(TT+KEYLEN);

 t1=inv(*Z++);
 t2=-*Z++;
 t3=-*Z++;
 *--p=inv(*Z++);
 *--p=t3;
 *--p=t2;
 *--p=t1;

 for(j=1;j<ROUNDS;j++)
 {       t1=*Z++;
 	*--p=*Z++;
 	*--p=t1;
 	t1=inv(*Z++);
 	t2=-*Z++;
 	t3=-*Z++;
 	*--p=inv(*Z++);
 	*--p=t2;
 	*--p=t3;
 	*--p=t1;
 }
 t1=*Z++;
 *--p=*Z++;
 *--p=t1;
 t1=inv(*Z++);
 t2=-*Z++;
 t3=-*Z++;
 *--p=inv(*Z++);
 *--p=t3;
 *--p=t2;
 *--p=t1;
 /*
 ** Copy and destroy temp copy
 */
 for(j=0,p=TT;j<KEYLEN;j++)
 {       *DK++=*p;
 	*p++=0;
 }

 return;
 }

 /*
 ** MUL(x,y)
 ** This #define creates a macro that computes x=x*y modulo 0x10001.
 ** Requires temps t16 and t32.  Also requires y to be strictly 16
 ** bits.  Here, I am using the simplest form.  May not be the
 ** fastest. -- RG
 */
 /* #define MUL(x,y) (x=mul(low16(x),y)) */

 /****************
 ** cipher_idea **
 *****************
 ** IDEA encryption/decryption algorithm.
 */
 static void cipher_idea(u16 in[4],
 		u16 out[4],
 		register IDEAkey Z)
 {
 register u16 x1, x2, x3, x4, t1, t2;
 /* register u16 t16;
 register u16 t32; */
 int r=ROUNDS;

 x1=*in++;
 x2=*in++;
 x3=*in++;
 x4=*in;

 do {
 	MUL(x1,*Z++);
 	x2+=*Z++;
 	x3+=*Z++;
 	MUL(x4,*Z++);

 	t2=x1^x3;
 	MUL(t2,*Z++);
 	t1=t2+(x2^x4);
 	MUL(t1,*Z++);
 	t2=t1+t2;

 	x1^=t1;
 	x4^=t2;

 	t2^=x2;
 	x2=x3^t1;
 	x3=t2;
 } while(--r);
 MUL(x1,*Z++);
 *out++=x1;
 *out++=x3+*Z++;
 *out++=x2+*Z++;
 MUL(x4,*Z);
 *out=x4;
 return;
 }

 /************************
 ** HUFFMAN COMPRESSION **
 ************************/

 /**************
 ** DoHuffman **
 ***************
 ** Execute a huffman compression on a block of plaintext.
 ** Note that (as with IDEA encryption) an iteration of the
 ** Huffman test includes a compression AND a decompression.
 ** Also, the compression cycle includes building the
 ** Huffman tree.
 */
 void DoHuffman(void)
 {
 HuffStruct *lochuffstruct;      /* Loc pointer to global data */
 char *errorcontext;
 int systemerror;
 ulong accumtime;
 double iterations;
 farchar *comparray;
 farchar *decomparray;
 farchar *plaintext;

 /*
 ** Link to global data
 */
 lochuffstruct=&global_huffstruct;

 /*
 ** Set error context.
 */
 errorcontext="CPU:Huffman";

 /*
 ** Allocate memory for the plaintext and the compressed text.
 ** We'll be really pessimistic here, and allocate equal amounts
 ** for both (though we know...well, we PRESUME) the compressed
 ** stuff will take less than the plain stuff.
 ** Also note that we'll build a 3rd buffer to decompress
 ** into, and we preallocate space for the huffman tree.
 ** (We presume that the Huffman tree will grow no larger
 ** than 512 bytes.  This is actually a super-conservative
 ** estimate...but, who cares?)
 */
 plaintext=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	ErrorExit();
 }
 comparray=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	FreeMemory(plaintext,&systemerror);
 	ErrorExit();
 }
 decomparray=(farchar *)AllocateMemory(lochuffstruct->arraysize,&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	FreeMemory(plaintext,&systemerror);
 	FreeMemory(comparray,&systemerror);
 	ErrorExit();
 }

 hufftree=(huff_node *)AllocateMemory(sizeof(huff_node) * 512,
 	&systemerror);
 if(systemerror)
 {       ReportError(errorcontext,systemerror);
 	FreeMemory(plaintext,&systemerror);
 	FreeMemory(comparray,&systemerror);
 	FreeMemory(decomparray,&systemerror);
 	ErrorExit();
 }

 /*
 ** Build the plaintext buffer.  Since we want this to
 ** actually be able to compress, we'll use the
 ** wordcatalog to build the plaintext stuff.
 */
 /*
 ** Reset random number generator so things repeat.
 ** added by Uwe F. Mayer
 */
 randnum((int32)13);
 create_text_block(plaintext,lochuffstruct->arraysize-1,(ushort)500);
 plaintext[lochuffstruct->arraysize-1L]='\0';
 plaintextlen=lochuffstruct->arraysize;

 /*
 ** See if we need to perform self adjustment loop.
 */
 if(lochuffstruct->adjust==0)
 {
 	/*
 	** Do self-adjustment.  This involves initializing the
 	** # of loops and increasing the loop count until we
 	** get a number of loops that we can use.
 	*/
 	for(lochuffstruct->loops=100L;
 	  lochuffstruct->loops<MAXHUFFLOOPS;
 	  lochuffstruct->loops+=10L)
 		if(DoHuffIteration(plaintext,
 			comparray,
 			decomparray,
 		  lochuffstruct->arraysize,
 		  lochuffstruct->loops,
 		  hufftree)>global_min_ticks) break;
 }

 /*
 ** All's well if we get here.  Do the test.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	accumtime+=DoHuffIteration(plaintext,
 		comparray,
 		decomparray,
 		lochuffstruct->arraysize,
 		lochuffstruct->loops,
 		hufftree);
 	iterations+=(double)lochuffstruct->loops;
 } while(iterations < BASE_ITERATIONS * 20);

 /*
 ** Clean up, calculate results, and go home.  Be sure to
 ** show that we don't have to rerun adjustment code.
 */
 FreeMemory((farvoid *)plaintext,&systemerror);
 FreeMemory((farvoid *)comparray,&systemerror);
 FreeMemory((farvoid *)decomparray,&systemerror);
 FreeMemory((farvoid *)hufftree,&systemerror);
 lochuffstruct->iterspersec=iterations / TicksToFracSecs(accumtime);

 if(lochuffstruct->adjust==0)
 	lochuffstruct->adjust=1;

 }

 /*********************
 ** create_text_line **
 **********************
 ** Create a random line of text, stored at *dt.  The line may be
 ** no more than nchars long.
 */
 static void create_text_line(farchar *dt,
 			long nchars)
 {
 long charssofar;        /* # of characters so far */
 long tomove;            /* # of characters to move */
 char myword[40];        /* Local buffer for words */
 farchar *wordptr;       /* Pointer to word from catalog */

 charssofar=0;

 do {
 /*
 ** Grab a random word from the wordcatalog
 */
 /* wordptr=wordcatarray[abs_randwc((long)WORDCATSIZE)];*/
 wordptr=wordcatarray[abs_randwc((int32)WORDCATSIZE)];
 MoveMemory((farvoid *)myword,
 	(farvoid *)wordptr,
 	(unsigned long)strlen(wordptr)+1);

 /*
 ** Append a blank.
 */
 tomove=strlen(myword)+1;
 myword[tomove-1]=' ';

 /*
 ** See how long it is.  If its length+charssofar > nchars, we have
 ** to trim it.
 */
 if((tomove+charssofar)>nchars)
 	tomove=nchars-charssofar;
 /*
 ** Attach the word to the current line.  Increment counter.
 */
 MoveMemory((farvoid *)dt,(farvoid *)myword,(unsigned long)tomove);
 charssofar+=tomove;
 dt+=tomove;

 /*
 ** If we're done, bail out.  Otherwise, go get another word.
 */
 } while(charssofar<nchars);

 return;
 }

 /**********************
 ** create_text_block **
 ***********************
 ** Build a block of text randomly loaded with words.  The words
 ** come from the wordcatalog (which must be loaded before you
 ** call this).
 ** *tb points to the memory where the text is to be built.
 ** tblen is the # of bytes to put into the text block
 ** maxlinlen is the maximum length of any line (line end indicated
 **  by a carriage return).
 */
 static void create_text_block(farchar *tb,
 			ulong tblen,
 			ushort maxlinlen)
 {
 ulong bytessofar;       /* # of bytes so far */
 ulong linelen;          /* Line length */

 bytessofar=0L;
 do {

 /*
 ** Pick a random length for a line and fill the line.
 ** Make sure the line can fit (haven't exceeded tablen) and also
 ** make sure you leave room to append a carriage return.
 */
 linelen=abs_randwc(maxlinlen-6)+6;
 if((linelen+bytessofar)>tblen)
 	linelen=tblen-bytessofar;

 if(linelen>1)
 {
 	create_text_line(tb,linelen);
 }
 tb+=linelen-1;          /* Add the carriage return */
 *tb++='\n';

 bytessofar+=linelen;

 } while(bytessofar<tblen);

 }

 /********************
 ** DoHuffIteration **
 *********************
 ** Perform the huffman benchmark.  This routine
 **  (a) Builds the huffman tree
 **  (b) Compresses the text
 **  (c) Decompresses the text and verifies correct decompression
 */
 static ulong DoHuffIteration(farchar *plaintext,
 	farchar *comparray,
 	farchar *decomparray,
 	ulong arraysize,
 	ulong nloops,
 	huff_node *hufftree)
 {
 int i;                          /* Index */
 long j;                         /* Bigger index */
 int root;                       /* Pointer to huffman tree root */
 float lowfreq1, lowfreq2;       /* Low frequency counters */
 int lowidx1, lowidx2;           /* Indexes of low freq. elements */
 long bitoffset;                 /* Bit offset into text */
 long textoffset;                /* Char offset into text */
 long maxbitoffset;              /* Holds limit of bit offset */
 long bitstringlen;              /* Length of bitstring */
 int c;                          /* Character from plaintext */
 char bitstring[30];             /* Holds bitstring */
 ulong elapsed;                  /* For stopwatch */
 #ifdef DEBUG
 int status=0;
 #endif

 /*
 ** Start the stopwatch
 */
 elapsed=StartStopwatch();

 /*
 ** Do everything for nloops
 */
 while(nloops--)
 {

 /*
 ** Calculate the frequency of each byte value. Store the
 ** results in what will become the "leaves" of the
 ** Huffman tree.  Interior nodes will be built in those
 ** nodes greater than node #255.
 */
 for(i=0;i<256;i++)
 {
 	hufftree[i].freq=(float)0.0;
 	hufftree[i].c=(unsigned char)i;
 }

 for(j=0;j<arraysize;j++)
 	hufftree[(int)plaintext[j]].freq+=(float)1.0;

 for(i=0;i<256;i++)
 	if(hufftree[i].freq != (float)0.0)
 		hufftree[i].freq/=(float)arraysize;

 /* Reset the second half of the tree. Otherwise the loop below that
 ** compares the frequencies up to index 512 makes no sense. Some
 ** systems automatically zero out memory upon allocation, others (like
 ** for example DEC Unix) do not. Depending on this the loop below gets
 ** different data and different run times. On our alpha the data that
 ** was arbitrarily assigned led to an underflow error at runtime. We
 ** use that zeroed-out bits are in fact 0 as a float.
 ** Uwe F. Mayer */
 bzero((char *)&(hufftree[256]),sizeof(huff_node)*256);
 /*
 ** Build the huffman tree.  First clear all the parent
 ** pointers and left/right pointers.  Also, discard all
 ** nodes that have a frequency of true 0.  */
 for(i=0;i<512;i++)
 {       if(hufftree[i].freq==(float)0.0)
 		hufftree[i].parent=EXCLUDED;
 	else
 		hufftree[i].parent=hufftree[i].left=hufftree[i].right=-1;
 }

 /*
 ** Go through the tree. Finding nodes of really low
 ** frequency.
 */
 root=255;                       /* Starting root node-1 */
 while(1)
 {
 	lowfreq1=(float)2.0; lowfreq2=(float)2.0;
 	lowidx1=-1; lowidx2=-1;
 	/*
 	** Find first lowest frequency.
 	*/
 	for(i=0;i<=root;i++)
 		if(hufftree[i].parent<0)
 			if(hufftree[i].freq<lowfreq1)
 			{       lowfreq1=hufftree[i].freq;
 				lowidx1=i;
 			}

 	/*
 	** Did we find a lowest value?  If not, the
 	** tree is done.
 	*/
 	if(lowidx1==-1) break;

 	/*
 	** Find next lowest frequency
 	*/
 	for(i=0;i<=root;i++)
 		if((hufftree[i].parent<0) && (i!=lowidx1))
 			if(hufftree[i].freq<lowfreq2)
 			{       lowfreq2=hufftree[i].freq;
 				lowidx2=i;
 			}

 	/*
 	** If we could only find one item, then that
 	** item is surely the root, and (as above) the
 	** tree is done.
 	*/
 	if(lowidx2==-1) break;

 	/*
 	** Attach the two new nodes to the current root, and
 	** advance the current root.
 	*/
 	root++;                 /* New root */
 	hufftree[lowidx1].parent=root;
 	hufftree[lowidx2].parent=root;
 	hufftree[root].freq=lowfreq1+lowfreq2;
 	hufftree[root].left=lowidx1;
 	hufftree[root].right=lowidx2;
 	hufftree[root].parent=-2;       /* Show root */
 }

 /*
 ** Huffman tree built...compress the plaintext
 */
 bitoffset=0L;                           /* Initialize bit offset */
 for(i=0;i<arraysize;i++)
 {
 	c=(int)plaintext[i];                 /* Fetch character */
 	/*
 	** Build a bit string for byte c
 	*/
 	bitstringlen=0;
 	while(hufftree[c].parent!=-2)
 	{       if(hufftree[hufftree[c].parent].left==c)
 			bitstring[bitstringlen]='0';
 		else
 			bitstring[bitstringlen]='1';
 		c=hufftree[c].parent;
 		bitstringlen++;
 	}

 	/*
 	** Step backwards through the bit string, setting
 	** bits in the compressed array as you go.
 	*/
 	while(bitstringlen--)
 	{       SetCompBit((u8 *)comparray,(u32)bitoffset,bitstring[bitstringlen]);
 		bitoffset++;
 	}
 }

 /*
 ** Compression done.  Perform de-compression.
 */
 maxbitoffset=bitoffset;
 bitoffset=0;
 textoffset=0;
 do {
 	i=root;
 	while(hufftree[i].left!=-1)
 	{       if(GetCompBit((u8 *)comparray,(u32)bitoffset)==0)
 			i=hufftree[i].left;
 		else
 			i=hufftree[i].right;
 		bitoffset++;
 	}
 	decomparray[textoffset]=hufftree[i].c;

 #ifdef DEBUG
 	if(hufftree[i].c != plaintext[textoffset])
 	{
 		/* Show error */
 		printf("Error at textoffset %ld\n",textoffset);
 		status=1;
 	}
 #endif
 	textoffset++;
 } while(bitoffset<maxbitoffset);

 }       /* End the big while(nloops--) from above */

 /*
 ** All done
 */
 #ifdef DEBUG
   if (status==0) printf("Huffman: OK\n");
 #endif
 return(StopStopwatch(elapsed));
 }

 /***************
 ** SetCompBit **
 ****************
 ** Set a bit in the compression array.  The value of the
 ** bit is set according to char bitchar.
 */
 static void SetCompBit(u8 *comparray,
 		u32 bitoffset,
 		char bitchar)
 {
 u32 byteoffset;
 int bitnumb;

 /*
 ** First calculate which element in the comparray to
 ** alter. and the bitnumber.
 */
 byteoffset=bitoffset>>3;
 bitnumb=bitoffset % 8;

 /*
 ** Set or clear
 */
 if(bitchar=='1')
 	comparray[byteoffset]|=(1<<bitnumb);
 else
 	comparray[byteoffset]&=~(1<<bitnumb);

 return;
 }

 /***************
 ** GetCompBit **
 ****************
 ** Return the bit value of a bit in the comparession array.
 ** Returns 0 if the bit is clear, nonzero otherwise.
 */
 static int GetCompBit(u8 *comparray,
 		u32 bitoffset)
 {
 u32 byteoffset;
 int bitnumb;

 /*
 ** Calculate byte offset and bit number.
 */
 byteoffset=bitoffset>>3;
 bitnumb=bitoffset % 8;

 /*
 ** Fetch
 */
 return((1<<bitnumb) & comparray[byteoffset] );
 }

 /********************************
 ** BACK PROPAGATION NEURAL NET **
 *********************************
 ** This code is a modified version of the code
 ** that was submitted to BYTE Magazine by
 ** Maureen Caudill.  It accomanied an article
 ** that I CANNOT NOW RECALL.
 ** The author's original heading/comment was
 ** as follows:
 **
 **  Backpropagation Network
 **  Written by Maureen Caudill
 **  in Think C 4.0 on a Macintosh
 **
 **  (c) Maureen Caudill 1988-1991
 **  This network will accept 5x7 input patterns
 **  and produce 8 bit output patterns.
 **  The source code may be copied or modified without restriction,
 **  but no fee may be charged for its use.
 **
 ** ++++++++++++++
 ** I have modified the code so that it will work
 ** on systems other than a Macintosh -- RG
 */

 /***********
 ** DoNNet **
 ************
 ** Perform the neural net benchmark.
 ** Note that this benchmark is one of the few that
 ** requires an input file.  That file is "NNET.DAT" and
 ** should be on the local directory (from which the
 ** benchmark program in launched).
 */
 void DoNNET(void)
 {
 NNetStruct *locnnetstruct;      /* Local ptr to global data */
 char *errorcontext;
 ulong accumtime;
 double iterations;

 /*
 ** Link to global data
 */
 locnnetstruct=&global_nnetstruct;

 /*
 ** Set error context
 */
 errorcontext="CPU:NNET";

 /*
 ** Init random number generator.
 ** NOTE: It is important that the random number generator
 **  be re-initialized for every pass through this test.
 **  The NNET algorithm uses the random number generator
 **  to initialize the net.  Results are sensitive to
 **  the initial neural net state.
 */
 /* randnum(3L); */
 randnum((int32)3);

 /*
 ** Read in the input and output patterns.  We'll do this
 ** only once here at the beginning.  These values don't
 ** change once loaded.
 */
 if(read_data_file()!=0)
    ErrorExit();


 /*
 ** See if we need to perform self adjustment loop.
 */
 if(locnnetstruct->adjust==0)
 {
 	/*
 	** Do self-adjustment.  This involves initializing the
 	** # of loops and increasing the loop count until we
 	** get a number of loops that we can use.
 	*/
 	for(locnnetstruct->loops=1L;
 	  locnnetstruct->loops<MAXNNETLOOPS;
 	  locnnetstruct->loops++)
 	  {     /*randnum(3L); */
 		randnum((int32)3);
 		if(DoNNetIteration(locnnetstruct->loops)
 			>global_min_ticks) break;
 	  }
 }

 /*
 ** All's well if we get here.  Do the test.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	/* randnum(3L); */    /* Gotta do this for Neural Net */
 	randnum((int32)3);    /* Gotta do this for Neural Net */
 	accumtime+=DoNNetIteration(locnnetstruct->loops);
 	iterations+=(double)locnnetstruct->loops;
 } while(iterations < BASE_ITERATIONS/2);

 /*
 ** Clean up, calculate results, and go home.  Be sure to
 ** show that we don't have to rerun adjustment code.
 */
 locnnetstruct->iterspersec=iterations / TicksToFracSecs(accumtime);

 if(locnnetstruct->adjust==0)
 	locnnetstruct->adjust=1;


 return;
 }

 /********************
 ** DoNNetIteration **
 *********************
 ** Do a single iteration of the neural net benchmark.
 ** By iteration, we mean a "learning" pass.
 */
 static ulong DoNNetIteration(ulong nloops)
 {
 ulong elapsed;          /* Elapsed time */
 int patt;

 /*
 ** Run nloops learning cycles.  Notice that, counted with
 ** the learning cycle is the weight randomization and
 ** zeroing of changes.  This should reduce clock jitter,
 ** since we don't have to stop and start the clock for
 ** each iteration.
 */
 elapsed=StartStopwatch();
 while(nloops--)
 {
 	randomize_wts();
 	zero_changes();
 	iteration_count=1;
 	learned = F;
 	numpasses = 0;
 	while (learned == F)
 	{
 		for (patt=0; patt<numpats; patt++)
 		{
 			worst_error = 0.0;      /* reset this every pass through data */
 			move_wt_changes();      /* move last pass's wt changes to momentum array */
 			do_forward_pass(patt);
 			do_back_pass(patt);
 			iteration_count++;
 		}
 		numpasses ++;
 		learned = check_out_error();
 	}
 #ifdef DEBUG
 printf("Learned in %d passes\n",numpasses);
 #endif
 }
 return(StopStopwatch(elapsed));
 }

 /*************************
 ** do_mid_forward(patt) **
 **************************
 ** Process the middle layer's forward pass
 ** The activation of middle layer's neurode is the weighted
 ** sum of the inputs from the input pattern, with sigmoid
 ** function applied to the inputs.
 **/
 static void  do_mid_forward(int patt)
 {
 double  sum;
 int     neurode, i;

 for (neurode=0;neurode<MID_SIZE; neurode++)
 {
 	sum = 0.0;
 	for (i=0; i<IN_SIZE; i++)
 	{       /* compute weighted sum of input signals */
 		sum += mid_wts[neurode][i]*in_pats[patt][i];
 	}
 	/*
 	** apply sigmoid function f(x) = 1/(1+exp(-x)) to weighted sum
 	*/
 	sum = 1.0/(1.0+exp(-sum));
 	mid_out[neurode] = sum;
 }
 return;
 }

 /*********************
 ** do_out_forward() **
 **********************
 ** process the forward pass through the output layer
 ** The activation of the output layer is the weighted sum of
 ** the inputs (outputs from middle layer), modified by the
 ** sigmoid function.
 **/
 static void  do_out_forward()
 {
 double sum;
 int neurode, i;

 for (neurode=0; neurode<OUT_SIZE; neurode++)
 {
 	sum = 0.0;
 	for (i=0; i<MID_SIZE; i++)
 	{       /*
 		** compute weighted sum of input signals
 		** from middle layer
 		*/
 		sum += out_wts[neurode][i]*mid_out[i];
 	}
 	/*
 	** Apply f(x) = 1/(1+exp(-x)) to weighted input
 	*/
 	sum = 1.0/(1.0+exp(-sum));
 	out_out[neurode] = sum;
 }
 return;
 }

 /*************************
 ** display_output(patt) **
 **************************
 ** Display the actual output vs. the desired output of the
 ** network.
 ** Once the training is complete, and the "learned" flag set
 ** to TRUE, then display_output sends its output to both
 ** the screen and to a text output file.
 **
 ** NOTE: This routine has been disabled in the benchmark
 ** version. -- RG
 **/
 /*
 void  display_output(int patt)
 {
 int             i;

 	fprintf(outfile,"\n Iteration # %d",iteration_count);
 	fprintf(outfile,"\n Desired Output:  ");

 	for (i=0; i<OUT_SIZE; i++)
 	{
 		fprintf(outfile,"%6.3f  ",out_pats[patt][i]);
 	}
 	fprintf(outfile,"\n Actual Output:   ");

 	for (i=0; i<OUT_SIZE; i++)
 	{
 		fprintf(outfile,"%6.3f  ",out_out[i]);
 	}
 	fprintf(outfile,"\n");
 	return;
 }
 */

 /**********************
 ** do_forward_pass() **
 ***********************
 ** control function for the forward pass through the network
 ** NOTE: I have disabled the call to display_output() in
 **  the benchmark version -- RG.
 **/
 static void  do_forward_pass(int patt)
 {
 do_mid_forward(patt);   /* process forward pass, middle layer */
 do_out_forward();       /* process forward pass, output layer */
 /* display_output(patt);        ** display results of forward pass */
 return;
 }

 /***********************
 ** do_out_error(patt) **
 ************************
 ** Compute the error for the output layer neurodes.
 ** This is simply Desired - Actual.
 **/
 static void do_out_error(int patt)
 {
 int neurode;
 double error,tot_error, sum;

 tot_error = 0.0;
 sum = 0.0;
 for (neurode=0; neurode<OUT_SIZE; neurode++)
 {
 	out_error[neurode] = out_pats[patt][neurode] - out_out[neurode];
 	/*
 	** while we're here, also compute magnitude
 	** of total error and worst error in this pass.
 	** We use these to decide if we are done yet.
 	*/
 	error = out_error[neurode];
 	if (error <0.0)
 	{
 		sum += -error;
 		if (-error > tot_error)
 			tot_error = -error; /* worst error this pattern */
 	}
 	else
 	{
 		sum += error;
 		if (error > tot_error)
 			tot_error = error; /* worst error this pattern */
 	}
 }
 avg_out_error[patt] = sum/OUT_SIZE;
 tot_out_error[patt] = tot_error;
 return;
 }

 /***********************
 ** worst_pass_error() **
 ************************
 ** Find the worst and average error in the pass and save it
 **/
 static void  worst_pass_error()
 {
 double error,sum;

 int i;

 error = 0.0;
 sum = 0.0;
 for (i=0; i<numpats; i++)
 {
 	if (tot_out_error[i] > error) error = tot_out_error[i];
 	sum += avg_out_error[i];
 }
 worst_error = error;
 average_error = sum/numpats;
 return;
 }

 /*******************
 ** do_mid_error() **
 ********************
 ** Compute the error for the middle layer neurodes
 ** This is based on the output errors computed above.
 ** Note that the derivative of the sigmoid f(x) is
 **        f'(x) = f(x)(1 - f(x))
 ** Recall that f(x) is merely the output of the middle
 ** layer neurode on the forward pass.
 **/
 static void do_mid_error()
 {
 double sum;
 int neurode, i;

 for (neurode=0; neurode<MID_SIZE; neurode++)
 {
 	sum = 0.0;
 	for (i=0; i<OUT_SIZE; i++)
 		sum += out_wts[i][neurode]*out_error[i];

 	/*
 	** apply the derivative of the sigmoid here
 	** Because of the choice of sigmoid f(I), the derivative
 	** of the sigmoid is f'(I) = f(I)(1 - f(I))
 	*/
 	mid_error[neurode] = mid_out[neurode]*(1-mid_out[neurode])*sum;
 }
 return;
 }

 /*********************
 ** adjust_out_wts() **
 **********************
 ** Adjust the weights of the output layer.  The error for
 ** the output layer has been previously propagated back to
 ** the middle layer.
 ** Use the Delta Rule with momentum term to adjust the weights.
 **/
 static void adjust_out_wts()
 {
 int weight, neurode;
 double learn,delta,alph;

 learn = BETA;
 alph  = ALPHA;
 for (neurode=0; neurode<OUT_SIZE; neurode++)
 {
 	for (weight=0; weight<MID_SIZE; weight++)
 	{
 		/* standard delta rule */
 		delta = learn * out_error[neurode] * mid_out[weight];

 		/* now the momentum term */
 		delta += alph * out_wt_change[neurode][weight];
 		out_wts[neurode][weight] += delta;

 		/* keep track of this pass's cum wt changes for next pass's momentum */
 		out_wt_cum_change[neurode][weight] += delta;
 	}
 }
 return;
 }

 /*************************
 ** adjust_mid_wts(patt) **
 **************************
 ** Adjust the middle layer weights using the previously computed
 ** errors.
 ** We use the Generalized Delta Rule with momentum term
 **/
 static void adjust_mid_wts(int patt)
 {
 int weight, neurode;
 double learn,alph,delta;

 learn = BETA;
 alph  = ALPHA;
 for (neurode=0; neurode<MID_SIZE; neurode++)
 {
 	for (weight=0; weight<IN_SIZE; weight++)
 	{
 		/* first the basic delta rule */
 		delta = learn * mid_error[neurode] * in_pats[patt][weight];

 		/* with the momentum term */
 		delta += alph * mid_wt_change[neurode][weight];
 		mid_wts[neurode][weight] += delta;

 		/* keep track of this pass's cum wt changes for next pass's momentum */
 		mid_wt_cum_change[neurode][weight] += delta;
 	}
 }
 return;
 }

 /*******************
 ** do_back_pass() **
 ********************
 ** Process the backward propagation of error through network.
 **/
 void  do_back_pass(int patt)
 {

 do_out_error(patt);
 do_mid_error();
 adjust_out_wts();
 adjust_mid_wts(patt);

 return;
 }


 /**********************
 ** move_wt_changes() **
 ***********************
 ** Move the weight changes accumulated last pass into the wt-change
 ** array for use by the momentum term in this pass. Also zero out
 ** the accumulating arrays after the move.
 **/
 static void move_wt_changes()
 {
 int i,j;

 for (i = 0; i<MID_SIZE; i++)
 	for (j = 0; j<IN_SIZE; j++)
 	{
 		mid_wt_change[i][j] = mid_wt_cum_change[i][j];
 		/*
 		** Zero it out for next pass accumulation.
 		*/
 		mid_wt_cum_change[i][j] = 0.0;
 	}

 for (i = 0; i<OUT_SIZE; i++)
 	for (j=0; j<MID_SIZE; j++)
 	{
 		out_wt_change[i][j] = out_wt_cum_change[i][j];
 		out_wt_cum_change[i][j] = 0.0;
 	}

 return;
 }

 /**********************
 ** check_out_error() **
 ***********************
 ** Check to see if the error in the output layer is below
 ** MARGIN*OUT_SIZE for all output patterns.  If so, then
 ** assume the network has learned acceptably well.  This
 ** is simply an arbitrary measure of how well the network
 ** has learned -- many other standards are possible.
 **/
 static int check_out_error()
 {
 int result,i,error;

 result  = T;
 error   = F;
 worst_pass_error();     /* identify the worst error in this pass */

 /*
 #ifdef DEBUG
 printf("\n Iteration # %d",iteration_count);
 #endif
 */
 for (i=0; i<numpats; i++)
 {
 /*      printf("\n Error pattern %d:   Worst: %8.3f; Average: %8.3f",
 	  i+1,tot_out_error[i], avg_out_error[i]);
 	fprintf(outfile,
 	 "\n Error pattern %d:   Worst: %8.3f; Average: %8.3f",
 	 i+1,tot_out_error[i]);
 */

 	if (worst_error >= STOP) result = F;
 	if (tot_out_error[i] >= 16.0) error = T;
 }

 if (error == T) result = ERR;


 #ifdef DEBUG
 /* printf("\n Error this pass thru data:   Worst: %8.3f; Average: %8.3f",
  worst_error,average_error);
 */
 /* fprintf(outfile,
  "\n Error this pass thru data:   Worst: %8.3f; Average: %8.3f",
   worst_error, average_error); */
 #endif

 return(result);
 }


 /*******************
 ** zero_changes() **
 ********************
 ** Zero out all the wt change arrays
 **/
 static void zero_changes()
 {
 int i,j;

 for (i = 0; i<MID_SIZE; i++)
 {
 	for (j=0; j<IN_SIZE; j++)
 	{
 		mid_wt_change[i][j] = 0.0;
 		mid_wt_cum_change[i][j] = 0.0;
 	}
 }

 for (i = 0; i< OUT_SIZE; i++)
 {
 	for (j=0; j<MID_SIZE; j++)
 	{
 		out_wt_change[i][j] = 0.0;
 		out_wt_cum_change[i][j] = 0.0;
 	}
 }
 return;
 }


 /********************
 ** randomize_wts() **
 *********************
 ** Intialize the weights in the middle and output layers to
 ** random values between -0.25..+0.25
 ** Function rand() returns a value between 0 and 32767.
 **
 ** NOTE: Had to make alterations to how the random numbers were
 ** created.  -- RG.
 **/
 static void randomize_wts()
 {
 int neurode,i;
 double value;

 /*
 ** Following not used int benchmark version -- RG
 **
 **        printf("\n Please enter a random number seed (1..32767):  ");
 **        scanf("%d", &i);
 **        srand(i);
 */

 for (neurode = 0; neurode<MID_SIZE; neurode++)
 {
 	for(i=0; i<IN_SIZE; i++)
 	{
 	        /* value=(double)abs_randwc(100000L); */
 		value=(double)abs_randwc((int32)100000);
 		value=value/(double)100000.0 - (double) 0.5;
 		mid_wts[neurode][i] = value/2;
 	}
 }
 for (neurode=0; neurode<OUT_SIZE; neurode++)
 {
 	for(i=0; i<MID_SIZE; i++)
 	{
 	        /* value=(double)abs_randwc(100000L); */
 		value=(double)abs_randwc((int32)100000);
 		value=value/(double)10000.0 - (double) 0.5;
 		out_wts[neurode][i] = value/2;
 	}
 }

 return;
 }


 /*********************
 ** read_data_file() **
 **********************
 ** Read in the input data file and store the patterns in
 ** in_pats and out_pats.
 ** The format for the data file is as follows:
 **
 ** line#   data expected
 ** -----   ------------------------------
 ** 1               In-X-size,in-y-size,out-size
 ** 2               number of patterns in file
 ** 3               1st X row of 1st input pattern
 ** 4..             following rows of 1st input pattern pattern
 **                 in-x+2  y-out pattern
 **                                 1st X row of 2nd pattern
 **                 etc.
 **
 ** Each row of data is separated by commas or spaces.
 ** The data is expected to be ascii text corresponding to
 ** either a +1 or a 0.
 **
 ** Sample input for a 1-pattern file (The comments to the
 ** right may NOT be in the file unless more sophisticated
 ** parsing of the input is done.):
 **
 ** 5,7,8                      input is 5x7 grid, output is 8 bits
 ** 1                          one pattern in file
 ** 0,1,1,1,0                  beginning of pattern for "O"
 ** 1,0,0,0,1
 ** 1,0,0,0,1
 ** 1,0,0,0,1
 ** 1,0,0,0,1
 ** 1,0,0,0,0
 ** 0,1,1,1,0
 ** 0,1,0,0,1,1,1,1            ASCII code for "O" -- 0100 1111
 **
 ** Clearly, this simple scheme can be expanded or enhanced
 ** any way you like.
 **
 ** Returns -1 if any file error occurred, otherwise 0.
 **/
 static int read_data_file()
 {
 FILE *infile;

 int xinsize,yinsize,youtsize;
 int patt, element, i, row;
 int vals_read;
 int val1,val2,val3,val4,val5,val6,val7,val8;

 /* printf("\n Opening and retrieving data from file."); */

 infile = fopen(inpath, "r");
 if (infile == NULL)
 {
 	printf("\n CPU:NNET--error in opening file!");
 	return -1 ;
 }
 vals_read =fscanf(infile,"%d  %d  %d",&xinsize,&yinsize,&youtsize);
 if (vals_read != 3)
 {
 	printf("\n CPU:NNET -- Should read 3 items in line one; did read %d",vals_read);
 	return -1;
 }
 vals_read=fscanf(infile,"%d",&numpats);
 if (vals_read !=1)
 {
 	printf("\n CPU:NNET -- Should read 1 item in line 2; did read %d",vals_read);
 	return -1;
 }
 if (numpats > MAXPATS)
 	numpats = MAXPATS;

 for (patt=0; patt<numpats; patt++)
 {
 	element = 0;
 	for (row = 0; row<yinsize; row++)
 	{
 		vals_read = fscanf(infile,"%d  %d  %d  %d  %d",
 			&val1, &val2, &val3, &val4, &val5);
 		if (vals_read != 5)
 		{
 			printf ("\n CPU:NNET -- failure in reading input!");
 			return -1;
 		}
 		element=row*xinsize;

 		in_pats[patt][element] = (double) val1; element++;
 		in_pats[patt][element] = (double) val2; element++;
 		in_pats[patt][element] = (double) val3; element++;
 		in_pats[patt][element] = (double) val4; element++;
 		in_pats[patt][element] = (double) val5; element++;
 	}
 	for (i=0;i<IN_SIZE; i++)
 	{
 		if (in_pats[patt][i] >= 0.9)
 			in_pats[patt][i] = 0.9;
 		if (in_pats[patt][i] <= 0.1)
 			in_pats[patt][i] = 0.1;
 	}
 	element = 0;
 	vals_read = fscanf(infile,"%d  %d  %d  %d  %d  %d  %d  %d",
 		&val1, &val2, &val3, &val4, &val5, &val6, &val7, &val8);

 	out_pats[patt][element] = (double) val1; element++;
 	out_pats[patt][element] = (double) val2; element++;
 	out_pats[patt][element] = (double) val3; element++;
 	out_pats[patt][element] = (double) val4; element++;
 	out_pats[patt][element] = (double) val5; element++;
 	out_pats[patt][element] = (double) val6; element++;
 	out_pats[patt][element] = (double) val7; element++;
 	out_pats[patt][element] = (double) val8; element++;
 }

 /* printf("\n Closing the input file now. "); */

 fclose(infile);
 return(0);
 }

 /*********************
 ** initialize_net() **
 **********************
 ** Do all the initialization stuff before beginning
 */
 /*
 static int initialize_net()
 {
 int err_code;

 randomize_wts();
 zero_changes();
 err_code = read_data_file();
 iteration_count = 1;
 return(err_code);
 }
 */

 /**********************
 ** display_mid_wts() **
 ***********************
 ** Display the weights on the middle layer neurodes
 ** NOTE: This routine is not used in the benchmark
 **  test -- RG
 **/
 /* static void display_mid_wts()
 {
 int             neurode, weight, row, col;

 fprintf(outfile,"\n Weights of Middle Layer neurodes:");

 for (neurode=0; neurode<MID_SIZE; neurode++)
 {
 	fprintf(outfile,"\n  Mid Neurode # %d",neurode);
 	for (row=0; row<IN_Y_SIZE; row++)
 	{
 		fprintf(outfile,"\n ");
 		for (col=0; col<IN_X_SIZE; col++)
 		{
 			weight = IN_X_SIZE * row + col;
 			fprintf(outfile," %8.3f ", mid_wts[neurode][weight]);
 		}
 	}
 }
 return;
 }
 */
 /**********************
 ** display_out_wts() **
 ***********************
 ** Display the weights on the output layer neurodes
 ** NOTE: This code is not used in the benchmark
 **  test -- RG
 */
 /* void  display_out_wts()
 {
 int             neurode, weight;

 	fprintf(outfile,"\n Weights of Output Layer neurodes:");

 	for (neurode=0; neurode<OUT_SIZE; neurode++)
 	{
 		fprintf(outfile,"\n  Out Neurode # %d \n",neurode);
 		for (weight=0; weight<MID_SIZE; weight++)
 		{
 			fprintf(outfile," %8.3f ", out_wts[neurode][weight]);
 		}
 	}
 	return;
 }
 */

 /***********************
 **  LU DECOMPOSITION  **
 ** (Linear Equations) **
 ************************
 ** These routines come from "Numerical Recipes in Pascal".
 ** Note that, as in the assignment algorithm, though we
 ** separately define LUARRAYROWS and LUARRAYCOLS, the two
 ** must be the same value (this routine depends on a square
 ** matrix).
 */

 /*********
 ** DoLU **
 **********
 ** Perform the LU decomposition benchmark.
 */
 void DoLU(void)
 {
 LUStruct *loclustruct;  /* Local pointer to global data */
 char *errorcontext;
 int systemerror;
 fardouble *a;
 fardouble *b;
 fardouble *abase;
 fardouble *bbase;
 LUdblptr ptra;
 int n;
 int i;
 ulong accumtime;
 double iterations;

 /*
 ** Link to global data
 */
 loclustruct=&global_lustruct;

 /*
 ** Set error context.
 */
 errorcontext="FPU:LU";

 /*
 ** Our first step is to build a "solvable" problem.  This
 ** will become the "seed" set that all others will be
 ** derived from. (I.E., we'll simply copy these arrays
 ** into the others.
 */
 a=(fardouble *)AllocateMemory(sizeof(double) * LUARRAYCOLS * LUARRAYROWS,
 		&systemerror);
 b=(fardouble *)AllocateMemory(sizeof(double) * LUARRAYROWS,
 		&systemerror);
 n=LUARRAYROWS;

 /*
 ** We need to allocate a temp vector that is used by the LU
 ** algorithm.  This removes the allocation routine from the
 ** timing.
 */
 LUtempvv=(fardouble *)AllocateMemory(sizeof(double)*LUARRAYROWS,
 	&systemerror);

 /*
 ** Build a problem to be solved.
 */
 ptra.ptrs.p=a;                  /* Gotta coerce linear array to 2D array */
 build_problem(*ptra.ptrs.ap,n,b);

 /*
 ** Now that we have a problem built, see if we need to do
 ** auto-adjust.  If so, repeatedly call the DoLUIteration routine,
 ** increasing the number of solutions per iteration as you go.
 */
 if(loclustruct->adjust==0)
 {
 	loclustruct->numarrays=0;
 	for(i=1;i<=MAXLUARRAYS;i++)
 	{
 		abase=(fardouble *)AllocateMemory(sizeof(double) *
 			LUARRAYCOLS*LUARRAYROWS*(i+1),&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			LUFreeMem(a,b,(fardouble *)NULL,(fardouble *)NULL);
 			ErrorExit();
 		}
 		bbase=(fardouble *)AllocateMemory(sizeof(double) *
 			LUARRAYROWS*(i+1),&systemerror);
 		if(systemerror)
 		{       ReportError(errorcontext,systemerror);
 			LUFreeMem(a,b,abase,(fardouble *)NULL);
 			ErrorExit();
 		}
 		if(DoLUIteration(a,b,abase,bbase,i)>global_min_ticks)
 		{       loclustruct->numarrays=i;
 			break;
 		}
 		/*
 		** Not enough arrays...free them all and try again
 		*/
 		FreeMemory((farvoid *)abase,&systemerror);
 		FreeMemory((farvoid *)bbase,&systemerror);
 	}
 	/*
 	** Were we able to do it?
 	*/
 	if(loclustruct->numarrays==0)
 	{       printf("FPU:LU -- Array limit reached\n");
 		LUFreeMem(a,b,abase,bbase);
 		ErrorExit();
 	}
 }
 else
 {       /*
 	** Don't need to adjust -- just allocate the proper
 	** number of arrays and proceed.
 	*/
 	abase=(fardouble *)AllocateMemory(sizeof(double) *
 		LUARRAYCOLS*LUARRAYROWS*loclustruct->numarrays,
 		&systemerror);
 	if(systemerror)
 	{       ReportError(errorcontext,systemerror);
 		LUFreeMem(a,b,(fardouble *)NULL,(fardouble *)NULL);
 		ErrorExit();
 	}
 	bbase=(fardouble *)AllocateMemory(sizeof(double) *
 		LUARRAYROWS*loclustruct->numarrays,&systemerror);
 	if(systemerror)
 	{
 		ReportError(errorcontext,systemerror);
 		LUFreeMem(a,b,abase,(fardouble *)NULL);
 		ErrorExit();
 	}
 }
 /*
 ** All's well if we get here.  Do the test.
 */
 accumtime=0L;
 iterations=(double)0.0;

 do {
 	accumtime+=DoLUIteration(a,b,abase,bbase,
 		loclustruct->numarrays);
 	iterations+=(double)loclustruct->numarrays;
 } while(iterations < BASE_ITERATIONS * 20);

 /*
 ** Clean up, calculate results, and go home.  Be sure to
 ** show that we don't have to rerun adjustment code.
 */
 loclustruct->iterspersec=iterations / TicksToFracSecs(accumtime);

 if(loclustruct->adjust==0)
 	loclustruct->adjust=1;

 LUFreeMem(a,b,abase,bbase);
 return;
 }

 /**************
 ** LUFreeMem **
 ***************
 ** Release memory associated with LU benchmark.
 */
 static void LUFreeMem(fardouble *a, fardouble *b,
 			fardouble *abase,fardouble *bbase)
 {
 int systemerror;

 FreeMemory((farvoid *)a,&systemerror);
 FreeMemory((farvoid *)b,&systemerror);
 FreeMemory((farvoid *)LUtempvv,&systemerror);

 if(abase!=(fardouble *)NULL) FreeMemory((farvoid *)abase,&systemerror);
 if(bbase!=(fardouble *)NULL) FreeMemory((farvoid *)bbase,&systemerror);
 return;
 }

 /******************
 ** DoLUIteration **
 *******************
 ** Perform an iteration of the LU decomposition benchmark.
 ** An iteration refers to the repeated solution of several
 ** identical matrices.
 */
 static ulong DoLUIteration(fardouble *a,fardouble *b,
 		fardouble *abase, fardouble *bbase,
 		ulong numarrays)
 {
 fardouble *locabase;
 fardouble *locbbase;
 LUdblptr ptra;  /* For converting ptr to 2D array */
 ulong elapsed;
 ulong j,i;              /* Indexes */


 /*
 ** Move the seed arrays (a & b) into the destination
 ** arrays;
 */
 for(j=0;j<numarrays;j++)
 {       locabase=abase+j*LUARRAYROWS*LUARRAYCOLS;
 	locbbase=bbase+j*LUARRAYROWS;
 	for(i=0;i<LUARRAYROWS*LUARRAYCOLS;i++)
 		*(locabase+i)=*(a+i);
 	for(i=0;i<LUARRAYROWS;i++)
 		*(locbbase+i)=*(b+i);
 }

 /*
 ** Do test...begin timing.
 */
 elapsed=StartStopwatch();
 for(i=0;i<numarrays;i++)
 {       locabase=abase+i*LUARRAYROWS*LUARRAYCOLS;
 	locbbase=bbase+i*LUARRAYROWS;
 	ptra.ptrs.p=locabase;
 	lusolve(*ptra.ptrs.ap,LUARRAYROWS,locbbase);
 }

 return(StopStopwatch(elapsed));
 }

 /******************
 ** build_problem **
 *******************
 ** Constructs a solvable set of linear equations.  It does this by
 ** creating an identity matrix, then loading the solution vector
 ** with random numbers.  After that, the identity matrix and
 ** solution vector are randomly "scrambled".  Scrambling is
 ** done by (a) randomly selecting a row and multiplying that
 ** row by a random number and (b) adding one randomly-selected
 ** row to another.
 */
 static void build_problem(double a[][LUARRAYCOLS],
 		int n,
 		double b[LUARRAYROWS])
 {
 long i,j,k,k1;  /* Indexes */
 double rcon;     /* Random constant */

 /*
 ** Reset random number generator
 */
 /* randnum(13L); */
 randnum((int32)13);

 /*
 ** Build an identity matrix.
 ** We'll also use this as a chance to load the solution
 ** vector.
 */
 for(i=0;i<n;i++)
 {       /* b[i]=(double)(abs_randwc(100L)+1L); */
 	b[i]=(double)(abs_randwc((int32)100)+(int32)1);
 	for(j=0;j<n;j++)
 		if(i==j)
 		        /* a[i][j]=(double)(abs_randwc(1000L)+1L); */
 			a[i][j]=(double)(abs_randwc((int32)1000)+(int32)1);
 		else
 			a[i][j]=(double)0.0;
 }

 #ifdef DEBUG
 printf("Problem:\n");
 for(i=0;i<n;i++)
 {
 /*
 	for(j=0;j<n;j++)
 		printf("%6.2f ",a[i][j]);
 */
 	printf("%.0f/%.0f=%.2f\t",b[i],a[i][i],b[i]/a[i][i]);
 /*
         printf("\n");
 */
 }
 #endif

 /*
 ** Scramble.  Do this 8n times.  See comment above for
 ** a description of the scrambling process.
 */

 for(i=0;i<8*n;i++)
 {
 	/*
 	** Pick a row and a random constant.  Multiply
 	** all elements in the row by the constant.
 	*/
  /*       k=abs_randwc((long)n);
 	rcon=(double)(abs_randwc(20L)+1L);
 	for(j=0;j<n;j++)
 		a[k][j]=a[k][j]*rcon;
 	b[k]=b[k]*rcon;
 */
 	/*
 	** Pick two random rows and add second to
 	** first.  Note that we also occasionally multiply
 	** by minus 1 so that we get a subtraction operation.
 	*/
         /* k=abs_randwc((long)n); */
         /* k1=abs_randwc((long)n); */
 	k=abs_randwc((int32)n);
 	k1=abs_randwc((int32)n);
 	if(k!=k1)
 	{
 		if(k<k1) rcon=(double)1.0;
 			else rcon=(double)-1.0;
 		for(j=0;j<n;j++)
 			a[k][j]+=a[k1][j]*rcon;;
 		b[k]+=b[k1]*rcon;
 	}
 }

 return;
 }


 /***********
 ** ludcmp **
 ************
 ** From the procedure of the same name in "Numerical Recipes in Pascal",
 ** by Press, Flannery, Tukolsky, and Vetterling.
 ** Given an nxn matrix a[], this routine replaces it by the LU
 ** decomposition of a rowwise permutation of itself.  a[] and n
 ** are input.  a[] is output, modified as follows:
 **   --                       --
 **  |  b(1,1) b(1,2) b(1,3)...  |
 **  |  a(2,1) b(2,2) b(2,3)...  |
 **  |  a(3,1) a(3,2) b(3,3)...  |
 **  |  a(4,1) a(4,2) a(4,3)...  |
 **  |  ...                      |
 **   --                        --
 **
 ** Where the b(i,j) elements form the upper triangular matrix of the
 ** LU decomposition, and the a(i,j) elements form the lower triangular
 ** elements.  The LU decomposition is calculated so that we don't
 ** need to store the a(i,i) elements (which would have laid along the
 ** diagonal and would have all been 1).
 **
 ** indx[] is an output vector that records the row permutation
 ** effected by the partial pivoting; d is output as +/-1 depending
 ** on whether the number of row interchanges was even or odd,
 ** respectively.
 ** Returns 0 if matrix singular, else returns 1.
 */
 static int ludcmp(double a[][LUARRAYCOLS],
 		int n,
 		int indx[],
 		int *d)
 {

 double big;     /* Holds largest element value */
 double sum;
 double dum;     /* Holds dummy value */
 int i,j,k;      /* Indexes */
 int imax=0;     /* Holds max index value */
 double tiny;    /* A really small number */

 tiny=(double)1.0e-20;

 *d=1;           /* No interchanges yet */

 for(i=0;i<n;i++)
 {       big=(double)0.0;
 	for(j=0;j<n;j++)
 		if((double)fabs(a[i][j]) > big)
 			big=fabs(a[i][j]);
 	/* Bail out on singular matrix */
 	if(big==(double)0.0) return(0);
 	LUtempvv[i]=1.0/big;
 }

 /*
 ** Crout's algorithm...loop over columns.
 */
 for(j=0;j<n;j++)
 {       if(j!=0)
 		for(i=0;i<j;i++)
 		{       sum=a[i][j];
 			if(i!=0)
 				for(k=0;k<i;k++)
 					sum-=(a[i][k]*a[k][j]);
 			a[i][j]=sum;
 		}
 	big=(double)0.0;
 	for(i=j;i<n;i++)
 	{       sum=a[i][j];
 		if(j!=0)
 			for(k=0;k<j;k++)
 				sum-=a[i][k]*a[k][j];
 		a[i][j]=sum;
 		dum=LUtempvv[i]*fabs(sum);
 		if(dum>=big)
 		{       big=dum;
 			imax=i;
 		}
 	}
 	if(j!=imax)             /* Interchange rows if necessary */
 	{       for(k=0;k<n;k++)
 		{       dum=a[imax][k];
 			a[imax][k]=a[j][k];
 			a[j][k]=dum;
 		}
 		*d=-*d;         /* Change parity of d */
 		dum=LUtempvv[imax];
 		LUtempvv[imax]=LUtempvv[j]; /* Don't forget scale factor */
 		LUtempvv[j]=dum;
 	}
 	indx[j]=imax;
 	/*
 	** If the pivot element is zero, the matrix is singular
 	** (at least as far as the precision of the machine
 	** is concerned.)  We'll take the original author's
 	** recommendation and replace 0.0 with "tiny".
 	*/
 	if(a[j][j]==(double)0.0)
 		a[j][j]=tiny;

 	if(j!=(n-1))
 	{       dum=1.0/a[j][j];
 		for(i=j+1;i<n;i++)
 			a[i][j]=a[i][j]*dum;
 	}
 }

 return(1);
 }

 /***********
 ** lubksb **
 ************
 ** Also from "Numerical Recipes in Pascal".
 ** This routine solves the set of n linear equations A X = B.
 ** Here, a[][] is input, not as the matrix A, but as its
 ** LU decomposition, created by the routine ludcmp().
 ** Indx[] is input as the permutation vector returned by ludcmp().
 **  b[] is input as the right-hand side an returns the
 ** solution vector X.
 ** a[], n, and indx are not modified by this routine and
 ** can be left in place for different values of b[].
 ** This routine takes into account the possibility that b will
 ** begin with many zero elements, so it is efficient for use in
 ** matrix inversion.
 */
 static void lubksb( double a[][LUARRAYCOLS],
 		int n,
 		int indx[LUARRAYROWS],
 		double b[LUARRAYROWS])
 {

 int i,j;        /* Indexes */
 int ip;         /* "pointer" into indx */
 int ii;
 double sum;

 /*
 ** When ii is set to a positive value, it will become
 ** the index of the first nonvanishing element of b[].
 ** We now do the forward substitution. The only wrinkle
 ** is to unscramble the permutation as we go.
 */
 ii=-1;
 for(i=0;i<n;i++)
 {       ip=indx[i];
 	sum=b[ip];
 	b[ip]=b[i];
 	if(ii!=-1)
 		for(j=ii;j<i;j++)
 			sum=sum-a[i][j]*b[j];
 	else
 		/*
 		** If a nonzero element is encountered, we have
 		** to do the sums in the loop above.
 		*/
 		if(sum!=(double)0.0)
 			ii=i;
 	b[i]=sum;
 }
 /*
 ** Do backsubstitution
 */
 for(i=(n-1);i>=0;i--)
 {
 	sum=b[i];
 	if(i!=(n-1))
 		for(j=(i+1);j<n;j++)
 			sum=sum-a[i][j]*b[j];
 	b[i]=sum/a[i][i];
 }
 return;
 }

 /************
 ** lusolve **
 *************
 ** Solve a linear set of equations: A x = b
 ** Original matrix A will be destroyed by this operation.
 ** Returns 0 if matrix is singular, 1 otherwise.
 */
 static int lusolve(double a[][LUARRAYCOLS],
 		int n,
 		double b[LUARRAYROWS])
 {
 int indx[LUARRAYROWS];
 int d;
 #ifdef DEBUG
 int i,j;
 #endif

 if(ludcmp(a,n,indx,&d)==0) return(0);

 /* Matrix not singular -- proceed */
 lubksb(a,n,indx,b);

 #ifdef DEBUG
 printf("Solution:\n");
 for(i=0;i<n;i++)
 {
   for(j=0;j<n;j++){
   /*
     printf("%6.2f ",a[i][j]);
   */
   }
   printf("%6.2f\t",b[i]);
   /*
     printf("\n");
   */
 }
 printf("\n");
 #endif

 return(1);
 }