/* DCT testbed */
 /* testing 12 bit TRACE type data, stored in (nx,ny) image,
 assume nx and ny are multiples of 8 */
 /* various code fragments taken from jpeg-5a */
#include<stdio.h>
#include<string.h>
#include <math.h>
#include "defs.h"
#include "ana_structures.h"
#if __sgi
#include  <sys/types.h>
#include <malloc.h>
#endif
 extern	struct sym_desc sym[];
 extern	struct sym_list		*subr_sym_list[];
 extern	int	temp_base;
 extern	int	edb_context;
 extern	int	ana_float();
 extern	char *find_sym_name();
 extern	int	ana_type_size[];
 extern	int	vfix_top, num_ana_subr, next_user_subr_num;
 extern	float	float_arg();
 extern	double	double_arg();

 /* zag and zig are used in the re-ordering of the DCT coefficients, they
 are inverses of one another, zag is used in the compress side */
 /* zag[i] is the natural-order position of the i'th element of zigzag order. */
 
 static const int zag[64] = {
   0,  1,  8, 16,  9,  2,  3, 10,
  17, 24, 32, 25, 18, 11,  4,  5,
  12, 19, 26, 33, 40, 48, 41, 34,
  27, 20, 13,  6,  7, 14, 21, 28,
  35, 42, 49, 56, 57, 50, 43, 36,
  29, 22, 15, 23, 30, 37, 44, 51,
  58, 59, 52, 45, 38, 31, 39, 46,
  53, 60, 61, 54, 47, 55, 62, 63
 };
 
 /* zig[i] is the zigzag-order position of the i'th element of a DCT block */
 /* read in natural order (left to right, top to bottom). */
 static const int zig[64] = {
      0,  1,  5,  6, 14, 15, 27, 28,
      2,  4,  7, 13, 16, 26, 29, 42,
      3,  8, 12, 17, 25, 30, 41, 43,
      9, 11, 18, 24, 31, 40, 44, 53,
     10, 19, 23, 32, 39, 45, 52, 54,
     20, 22, 33, 38, 46, 51, 55, 60,
     21, 34, 37, 47, 50, 56, 59, 61,
     35, 36, 48, 49, 57, 58, 62, 63
 };
 /* zigtrans is the transpose of zig and is used as a destination pointer
 in some of the forward dct's */
 static int zigtrans[64] = {
	 0,   2,   3,   9,  10,  20,  21,  35,
	 1,   4,   8,  11,  19,  22,  34,  36,
	 5,   7,  12,  18,  23,  33,  37,  48,
	 6,  13,  17,  24,  32,  38,  47,  49,
	14,  16,  25,  31,  39,  46,  50,  57,
	15,  26,  30,  40,  45,  51,  56,  58,
	27,  29,  41,  44,  52,  55,  59,  62,
	28,  42,  43,  53,  54,  60,  61,  63,
 };
  static float aansf[8] = { 1.0, 1.387039845, 1.306562965, 1.175875602,
	   1.0, 0.785694958, 0.541196100, 0.275899379};

#define huff_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))

 static const int extend_test[16] =   /* entry n is 2**(n-1) */
  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };

 static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
 
 static void dct_cell(), rdct_cell(), int_dct_cell(), exp_dct_cell(), exp2_dct_cell();
/* #ifdef __alpha */
 void exp3_dct_cell(), exp4_dct_cell(), exp5_dct_cell();
/* #endif */
 /* workspace (ws) for the dct calcs, and a place for the modified q tables */
 /* should malloc these only if we need them ... */
 static float ws[64], fqtbl[64], jpeg_bias = 2048., bias = 2048.5;
 static int wsint[64], intqtb[64];
 static short inverseqt[64];
 static short wshort[64], wshort2[64];
 static short dctw[64] = {
 8192,  8192,  8192,  8192,  8192,  8192,  8192,   8192,
 11363, 9633,  6436,  2260, -2260, -6436, -9633, -11363,
 10703, 4433, -4433,-10703,-10703, -4433,  4433,  10703,
 9633, -2260,-11363, -6436,  6436, 11363,  2260,  -9633,
 8192, -8192, -8192,  8192, 8192,  -8192, -8192,   8192,
 6436,-11363,  2260,  9633, -9633, -2260, 11363,  -6436,
 4433,-10703, 10703, -4433, -4433, 10703, -10703,  4433,
 2260, -6436,  9633,-11363, 11363, -9633,   6436, -2260
 };
 /*------------------------------------------------------------------------*/
int fdct_exp(image, nx, ny, qtable, dct_array)
 /* int dct, quant, and re-order */
 /* exp. code for testing trace methods out */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 short	*qtable, *image, *dct_array;
 int	nx, ny;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 short	*dctstart;
 /* condition the q table for the output of the dct, taken from jcdctmgr.c */
 /* for this method, we used to multiply by 8, but now we do the extra 8
 divide in the dct step to be able to store a 16 bit result right away */
 for (i = 0; i < 64; i++) {
   intqtb[i] = (int) qtable[i]; /* just copy into I*4 */
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 exp_dct_cell(&image[iq], ystride, dctstart);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*--------------------------------------------------------------------------*/
int fdct_exp2(image, nx, ny, qtable, dct_array,test_flag)
 /* int dct, quant, and re-order */
 /* exp. code for testing trace methods out */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 short	*qtable, *image, *dct_array;
 int	nx, ny, test_flag;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 short	*dctstart;
 /* condition the q table for the output of the dct, taken from jcdctmgr.c */
 /* for this method, we used to multiply by 8, but now we do the extra 8
 divide in the dct step to be able to store a 16 bit result right away */
 for (i = 0; i < 64; i++) {
   intqtb[i] = (int) qtable[i]; /* just copy into I*4 */
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 exp2_dct_cell(&image[iq], ystride, dctstart, test_flag);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*--------------------------------------------------------------------------*/
 /* the exp3 routines are only defined for the alpha, they use 64 bit shifts
 and such */
/* #ifdef __alpha */
int ana_trace_expdct3(narg,ps)		/* function for exp dct tables */
 /* dct_array = ana_trace_expdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 5 for int dct */
 {  return trace_dct(narg,ps, 5); }
 /*---------------------------------------------------------------------------*/
int fdct_exp3(image, nx, ny, qtable, dct_array,test_flag)
 /* int dct, quant, and re-order */
 /* exp. code for testing trace methods out */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 short	*qtable, *image, *dct_array;
 int	nx, ny, test_flag;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 short	*dctstart;
 printf("in fdct_exp3\n");
 /* condition the q table for the output of the dct, taken from jcdctmgr.c */
 /* the exp3 version uses multiplies for the QT */
 for (i = 0; i < 64; i++) {
   inverseqt[i] = (short) ((0x00004000)/(int) qtable[i]); /* (2^14)/qtable */
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 exp3_dct_cell(&image[iq], ystride, dctstart, test_flag);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*--------------------------------------------------------------------------*/
int fdct_exp5(image, nx, ny, qtable, dct_array,test_flag)
 /* int dct, quant, and re-order */
 /* exp. code for testing trace methods out */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 short	*qtable, *image, *dct_array;
 int	nx, ny, test_flag;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 short	*dctstart;
 printf("in fdct_exp3\n");
 /* condition the q table for the output of the dct, taken from jcdctmgr.c */
 /* the exp5 version uses multiplies for the QT */
 for (i = 0; i < 64; i++) {
   inverseqt[i] = (short) ((0x00004000)/(int) qtable[i]); /* (2^14)/qtable */
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 exp5_dct_cell(&image[iq], ystride, dctstart, test_flag);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*--------------------------------------------------------------------------*/
void exp3_dct_cell(start, ystride, dctstart, test_flag)
 /* does dct and quant. for one cell */
 /* simple 1-D DCT's done as sum, sort of emulating the 16 bit operands
 and 48 bit accumulator in the IP */
 /* differs from exp2_dct_cell in treatment of quantization, test
 multiply rather than divide */
 short	*start, *dctstart;
 int	ystride, test_flag;
 {
 int i, j, n;
 short	int_bias = 2048; /* note this is I*2 in this routine */
 /* remove bias and stash in short workspace array wshort */
 {
  /* no problem doing this in I*2 */
  register short *wsptr;
  register short *elemptr;
  wsptr = wshort;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   elemptr += n;
 } }
 /* now the actual dct, use pre-generated weights, 8x8 array although we
 don't actually use the 0 row because it is constant, note
 that we take elements from wshort and put them in wshort2, row
 4 is constant except for sign, we just use it so that it looks
 like all the others */
  /* Pass 1: process rows. */
  {
  int	nq = 8;
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw;
  int	acc;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    
    /* dc term, could use weights here and then it would look like
    the others; instead, just add and shift left by 1 */
    p = wsptr;	n = 8;	acc = 0;
    while(n--) acc += (int) *p++;	wsptr2[0] = (short) (acc << 1);
    
    /* start weight pointer at 8 because dc term was done different */
    p = wsptr;	n = 8;	acc = 2048;  pw = &dctw[8];
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[1*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[2*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[3*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[4*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[5*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[6*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[7*8] = (short) (acc >> 12);
  
    wsptr2++;
    wsptr +=  8;		/* advance pointer to next row */
  } }
 /* output these results for a test if test_flag set */
 if (test_flag)
 {
  register short *wsptr2 = wshort2;
  register short *dctptr = dctstart;
  for (i=0; i < 64; i++) {
  *dctptr++ = *wsptr2++;
  }
 return;
 }
  /* Pass 2: process columns. */
{
  /* here we go from wshort2 to wshort */
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw, *qpt;
  int	*z = zigtrans;
  long long	acc;
  int	nq = 8;
  qpt = inverseqt;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    /* almost the same as row pass, except for buffers switched and
    a different shift amount to get final results */
    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long) *p++;
    dctstart[*z++] = (short) ( ((acc * (long long) qpt[0]) + 0x20000) >> 18);
    
    p = wsptr2;	n = 8;	acc = 0;  pw = &dctw[8];
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    /* now include the quant. value (inverted) and round */
    acc = acc * (long long) qpt[1*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) ( acc >> 31);

    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    acc = acc * (long long) qpt[2*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) (acc >> 31);

    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    acc = acc * (long long) qpt[3*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) (acc >> 31);

    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    acc = acc * (long long) qpt[4*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) (acc >> 31);

    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    acc = acc * (long long) qpt[5*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) (acc >> 31);

    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    acc = acc * (long long) qpt[6*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) (acc >> 31);

    p = wsptr2;	n = 8;	acc = 0;
    while(n--) acc += (long long)*p++ * (long long)*pw++;
    acc = acc * (long long) qpt[7*8];
    acc = acc + 0x40000000;
    dctstart[*z++] = (short) (acc >> 31);

    wsptr2 += 8;
    qpt++;			/* advance pointer to next column */
  } }  
 /* the zigzag and quantization were included in the second pass above */ 
 return;
 }
 /*---------------------------------------------------------------------------*/
void exp5_dct_cell(start, ystride, dctstart, test_flag)
 /* does dct and quant. for one cell */
 /* simple 1-D DCT's done as sum, sort of emulating the 16 bit operands
 and 48 bit accumulator in the IP */
 /* differs from exp2_dct_cell in treatment of quantization, test
 multiply rather than divide */
 /* the exp5_dct is similar to exp3_dct_cell but it mimics the DEP more
 closely, the acc for the column gets a 16 bit result which is then
 multiplied by the inverted QT; exp3_dct_cell did not use an intermediate
 16 bit result, the DEP has to */
 short	*start, *dctstart;
 int	ystride, test_flag;
 {
 int i, j, n;
 short	int_bias = 2048; /* note this is I*2 in this routine */
 /* remove bias and stash in short workspace array wshort */
 {
  /* no problem doing this in I*2 */
  register short *wsptr;
  register short *elemptr;
  wsptr = wshort;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   elemptr += n;
 } }
 /* now the actual dct, use pre-generated weights, 8x8 array although we
 don't actually use the 0 row because it is constant, note
 that we take elements from wshort and put them in wshort2, row
 4 is constant except for sign, we just use it so that it looks
 like all the others */
  /* Pass 1: process rows. */
  {
  int	nq = 8;
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw;
  int	acc;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    
    /* dc term, could use weights here and then it would look like
    the others; instead, just add and shift left by 1 */
    p = wsptr;	n = 8;	acc = 0;
    while(n--) acc += (int) *p++;	wsptr2[0] = (short) (acc << 1);
    
    /* start weight pointer at 8 because dc term was done different */
    p = wsptr;	n = 8;	acc = 2048;  pw = &dctw[8];
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[1*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[2*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[3*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[4*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[5*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[6*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[7*8] = (short) (acc >> 12);
  
    wsptr2++;
    wsptr +=  8;		/* advance pointer to next row */
  } }
 /* output these results for a test if test_flag set */
 if (test_flag)
 {
  register short *wsptr2 = wshort2;
  register short *dctptr = dctstart;
  for (i=0; i < 64; i++) {
  *dctptr++ = *wsptr2++;
  }
 return;
 }
  /* Pass 2: process columns. */
{
  /* here we go from wshort2 to wshort */
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw, *qpt;
  short		acc16;
  int	*z = zigtrans;
  int	acc;
  int	nq = 8;
  qpt = inverseqt;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    /* almost the same as row pass, except for buffers switched and
    a different shift amount to get final results */
    p = wsptr2;	n = 8;	acc = 8;
    while(n--) acc += (int) *p++;
    acc16 = (short) (acc >> 4);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[0]) + 0x2000) >> 14);
    
    p = wsptr2;	n = 8;	acc = 65536;  pw = &dctw[8];
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[1*8]) + 0x2000) >> 14);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[2*8]) + 0x2000) >> 14);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[3*8]) + 0x2000) >> 14);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[4*8]) + 0x2000) >> 14);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[5*8]) + 0x2000) >> 14);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[6*8]) + 0x2000) >> 14);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    acc16 = (short) (acc >> 17);
    dctstart[*z++] = (short) ( (((int) acc16 * (int) qpt[7*8]) + 0x2000) >> 14);

    wsptr2 += 8;
    qpt++;			/* advance pointer to next column */
  } }  
 /* the zigzag and quantization were included in the second pass above */ 
 return;
 }
 /*---------------------------------------------------------------------------*/
int ana_trace_expdct4(narg,ps)		/* function for exp dct tables */
 /* dct_array = ana_trace_expdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 6 for int dct */
 {  return trace_dct(narg,ps, 6); }
 /*---------------------------------------------------------------------------*/
int ana_trace_expdct5(narg,ps)		/* function for exp dct tables */
 /* dct_array = ana_trace_expdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 7 for int dct */
 {  return trace_dct(narg,ps, 7); }
 /*---------------------------------------------------------------------------*/
 static short dctwqt[512];
int fdct_exp4(image, nx, ny, qtable, dct_array,test_flag)
 /* int dct, quant, and re-order */
 /* exp. code for testing trace methods out */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 short	*qtable, *image, *dct_array;
 int	nx, ny, test_flag;
 {
 int	n, i, j, ncx, iq, ystride, icx, icy, row, col, acc;
 short	*dctstart, *pwqt = dctwqt, *pw = dctw, *qpt;
 printf("in fdct_exp4\n");
 /* first get an inverse Q table */
 for (i = 0; i < 64; i++) {
   inverseqt[i] = (short) ((0x00004000)/(int) qtable[i]); /* (2^14)/qtable */
   }
 /* the exp4 version combines QT and weights into a 512 table for second
 pass */

 for (j = 0; j < 8; j++) {
 pw = dctw; 	qpt = inverseqt + j;
 for (i = 0; i < 8; i++) {
 n = 8;
  while (n--) {  acc = (int) *pw++ * (int) *qpt;  acc = (acc + 0x2000) >> 14;
  	*pwqt++ = (short) acc; }
  qpt = qpt + 8;
  }
  }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 exp4_dct_cell(&image[iq], ystride, dctstart, test_flag);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*--------------------------------------------------------------------------*/
void exp4_dct_cell(start, ystride, dctstart, test_flag)
 /* does dct and quant. for one cell */
 /* simple 1-D DCT's done as sum, sort of emulating the 16 bit operands
 and 48 bit accumulator in the IP */
 /* differs from exp2_dct_cell in treatment of quantization, test
 multiply rather than divide */
 short	*start, *dctstart;
 int	ystride, test_flag;
 {
 int i, j, n;
 short	int_bias = 2048; /* note this is I*2 in this routine */
 /* remove bias and stash in short workspace array wshort */
 {
  /* no problem doing this in I*2 */
  register short *wsptr;
  register short *elemptr;
  wsptr = wshort;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   elemptr += n;
 } }
 /* now the actual dct, use pre-generated weights, 8x8 array although we
 don't actually use the 0 row because it is constant, note
 that we take elements from wshort and put them in wshort2, row
 4 is constant except for sign, we just use it so that it looks
 like all the others */
  /* Pass 1: process rows. */
  {
  int	nq = 8;
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw;
  int	acc;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    
    /* dc term, could use weights here and then it would look like
    the others; instead, just add and shift left by 1 */
    p = wsptr;	n = 8;	acc = 0;
    while(n--) acc += (int) *p++;	wsptr2[0] = (short) (acc << 1);
    
    /* start weight pointer at 8 because dc term was done different */
    p = wsptr;	n = 8;	acc = 2048;  pw = &dctw[8];
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[1*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[2*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[3*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[4*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[5*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[6*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[7*8] = (short) (acc >> 12);
  
    wsptr2++;
    wsptr +=  8;		/* advance pointer to next row */
  } }
 /* output these results for a test if test_flag set */
 if (test_flag)
 {
  register short *wsptr2 = wshort2;
  register short *dctptr = dctstart;
  for (i=0; i < 64; i++) {
  *dctptr++ = *wsptr2++;
  }
 return;
 }
  /* Pass 2: process columns. */
{
  /* here we go from wshort2 to wshort */
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pwqt = dctwqt;
  int	*z = zigtrans;
  int	acc;
  int	nq = 8, mq;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    /* use a 512 weight table which combines weights and QT */
    mq = 8;
    while (mq--) {
    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int) *p++ * (int) *pwqt++;
    dctstart[*z++] = (short) (acc >> 17);
    }
    wsptr2 += 8;
  } }  
 /* the zigzag and quantization were included in the second pass above */ 
 return;
 }
 /*---------------------------------------------------------------------------*/
/* #endif */
int fdct_int(image, nx, ny, qtable, dct_array)
 /* int dct, quant, and re-order */
 /* this uses the slow int method for 12 bit pixels */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 short	*qtable, *image, *dct_array;
 int	nx, ny;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 short	*dctstart;
 /* condition the q table for the output of the dct, taken from jcdctmgr.c */
 /* for this method, we just multiply by 8 */
 for (i = 0; i < 64; i++) {
   intqtb[i] = 8 * (int) qtable[i];
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 int_dct_cell(&image[iq], ystride, dctstart);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*-------------------------------------------------------------------------*/
int fdct_fp(image, nx, ny, qtable, dct_array)
 /* fp dct, quant, and re-order */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 float	*dct_array;
 short	*qtable, *image;
 int	nx, ny;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 float	*dctstart;
 /* condition the q table for the output of the dct, taken from jcdctmgr.c */
 /* this is set up inverted so we multiply rather than divide later */
 for (i = 0; i < 64; i++) {
   row = zag[i] >> 3;
   col = zag[i] & 7;
   fqtbl[i] = (0.125 / (((float) qtable[i] * aansf[row] * aansf[col])));
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 /* printf("iq = %d, dctstart - dct_array = %d\n", iq, dctstart - dct_array);*/
 dct_cell(&image[iq], ystride, dctstart);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*---------------------------------------------------------------------------*/
int rdct_fp(image, nx, ny, qtable, dct_array)
 /* reverse fp dct, quant, and re-order included, result is I*2 */
 /* nx and ny must be multiples of 8 (check in calling routine) */
 float	*dct_array;
 short	*qtable, *image;
 int	nx, ny;
 {
 int	n, i, ncx, iq, ystride, icx, icy, row, col;
 float	*dctstart;
 /* condition the q table for the dct's we created, must be same input qtable */
 i = 0;
 for (i = 0; i < 64; i++) {
   row = zag[i] >> 3;
   col = zag[i] & 7;
   fqtbl[i] = (float) qtable[i] * aansf[row] * aansf[col] * 0.125;
   }
 /* break out the cells and call dct_cell for each */
 n = nx*ny/64;	/* number of cells */
 ncx = nx/8;	ystride = nx;
 dctstart = dct_array;
 for (i=0; i < n; i++) {
 /* note that we access image by 8x8 blocks but dct is stored seq. */
 icx = i%ncx;		icy = i/ncx;	iq = icx*8 + icy*8*ystride;
 /*printf("iq = %d, dctstart - dct_array = %d\n", iq, dctstart - dct_array);*/
 rdct_cell(&image[iq], ystride, dctstart);
 dctstart += 64;
 }
 return 1;	/* success return */
 }
 /*---------------------------------------------------------------------------*/
void dct_cell(start, ystride, dctstart)
 /* does dct and quant. for one cell */
 short	*start;
 float	*dctstart;
 int	ystride;
 {
 int i, j, n;
 float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 float tmp10, tmp11, tmp12, tmp13;
 float z1, z2, z3, z4, z5, z11, z13;

 /* remove bias and stash in workspace array ws */
 {
  register float *wsptr;
  register short *elemptr;
  wsptr = ws;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   *wsptr++ = (float) *elemptr++ - jpeg_bias;
   elemptr += n;
 } }
 /* now the actual dct */
  /* Pass 1: process rows. */
  {
  register float *wsptr = ws;
  int	nq = 8;
  while (nq--) {
    tmp0 = wsptr[0] + wsptr[7];
    tmp7 = wsptr[0] - wsptr[7];
    tmp1 = wsptr[1] + wsptr[6];
    tmp6 = wsptr[1] - wsptr[6];
    tmp2 = wsptr[2] + wsptr[5];
    tmp5 = wsptr[2] - wsptr[5];
    tmp3 = wsptr[3] + wsptr[4];
    tmp4 = wsptr[3] - wsptr[4];
    
    /* Even part */
    
    tmp10 = tmp0 + tmp3;	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    wsptr[0] = tmp10 + tmp11; /* phase 3 */
    wsptr[4] = tmp10 - tmp11;
    
    z1 = (tmp12 + tmp13) * ( 0.707106781); /* c4 */
    wsptr[2] = tmp13 + z1;	/* phase 5 */
    wsptr[6] = tmp13 - z1;
    
    /* Odd part */

    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = (tmp10 - tmp12) * ( 0.382683433); /* c6 */
    z2 = ( 0.541196100) * tmp10 + z5; /* c2-c6 */
    z4 = ( 1.306562965) * tmp12 + z5; /* c2+c6 */
    z3 = tmp11 * ( 0.707106781); /* c4 */

    z11 = tmp7 + z3;		/* phase 5 */
    z13 = tmp7 - z3;

    wsptr[5] = z13 + z2;	/* phase 6 */
    wsptr[3] = z13 - z2;
    wsptr[1] = z11 + z4;
    wsptr[7] = z11 - z4;

    wsptr += 8;		/* advance pointer to next row */
  } }

  /* Pass 2: process columns. */
  {
  register float *wsptr = ws;
  int	nq = 8;
  while (nq--) {
    tmp0 = wsptr[0] + wsptr[8*7];
    tmp7 = wsptr[0] - wsptr[8*7];
    tmp1 = wsptr[8] + wsptr[8*6];
    tmp6 = wsptr[8] - wsptr[8*6];
    tmp2 = wsptr[8*2] + wsptr[8*5];
    tmp5 = wsptr[8*2] - wsptr[8*5];
    tmp3 = wsptr[8*3] + wsptr[8*4];
    tmp4 = wsptr[8*3] - wsptr[8*4];
    
    /* Even part */
    
    tmp10 = tmp0 + tmp3;	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    wsptr[0] = tmp10 + tmp11; /* phase 3 */
    wsptr[8*4] = tmp10 - tmp11;
    
    z1 = (tmp12 + tmp13) * ( 0.707106781); /* c4 */
    wsptr[8*2] = tmp13 + z1; /* phase 5 */
    wsptr[8*6] = tmp13 - z1;
    
    /* Odd part */

    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = (tmp10 - tmp12) * ( 0.382683433); /* c6 */
    z2 = ( 0.541196100) * tmp10 + z5; /* c2-c6 */
    z4 = ( 1.306562965) * tmp12 + z5; /* c2+c6 */
    z3 = tmp11 * ( 0.707106781); /* c4 */

    z11 = tmp7 + z3;		/* phase 5 */
    z13 = tmp7 - z3;

    wsptr[8*5] = z13 + z2; /* phase 6 */
    wsptr[8*3] = z13 - z2;
    wsptr[8*1] = z11 + z4;
    wsptr[8*7] = z11 - z4;

    wsptr++;			/* advance pointer to next column */
  } }

 /* quantize and re-order */
 /* for this fp test, we do not convert to int but save the "quantized"
 fp DCT for subsequent scrutiny */
 { register float *wsptr, *dctptr;
  wsptr = ws;	dctptr = dctstart;
  for (i=0; i < 64; i++) {
  *dctptr++ = wsptr[zag[i]] * fqtbl[i];
  }
 }
 }
 /*---------------------------------------------------------------------------*/
void int_dct_cell(start, ystride, dctstart)
 /* does dct and quant. for one cell */
 short	*start, *dctstart;
 int	ystride;
 {
 int i, j, n;
 int	int_bias = 2048;
 int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 int tmp10, tmp11, tmp12, tmp13;
 int z1, z2, z3, z4, z5;
#define FIX_0_298631336  ((int)  2446)
#define FIX_0_390180644  ((int)  3196)
#define FIX_0_541196100  ((int)  4433)
#define FIX_0_765366865  ((int)  6270)
#define FIX_0_899976223  ((int)  7373)
#define FIX_1_175875602  ((int)  9633)
#define FIX_1_501321110  ((int)  12299)
#define FIX_1_847759065  ((int)  15137)
#define FIX_1_961570560  ((int)  16069)
#define FIX_2_053119869  ((int)  16819)
#define FIX_2_562915447  ((int)  20995)
#define FIX_3_072711026  ((int)  25172)
#define CONST_BITS  13
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
 /* some of the jpeg-5a macros are handy, note SHIFT_TEMPS is blank */
#define SHIFT_TEMPS
#define RIGHT_SHIFT(x,shft)	((x) >> (shft))
#define ONE	((int) 1)
#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)

 /* remove bias and stash in workspace array ws */
 {
  register int *wsptr;
  register short *elemptr;
  wsptr = wsint;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   *wsptr++ =  (int) *elemptr++ - int_bias;
   elemptr += n;
 } }
 /* now the actual dct */
  /* Pass 1: process rows. */
  {
  register int *wsptr = wsint;
  int	nq = 8;
  while (nq--) {
    tmp0 = wsptr[0] + wsptr[7];
    tmp7 = wsptr[0] - wsptr[7];
    tmp1 = wsptr[1] + wsptr[6];
    tmp6 = wsptr[1] - wsptr[6];
    tmp2 = wsptr[2] + wsptr[5];
    tmp5 = wsptr[2] - wsptr[5];
    tmp3 = wsptr[3] + wsptr[4];
    tmp4 = wsptr[3] - wsptr[4];
    
    /* Even part */
    
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    /* we can only shift by 1 bit in each coeff. if the constants use 13,
    or overflows in 32 bit regs, this may lose a little */
    wsptr[0] = (tmp10 + tmp11) << PASS1_BITS;
    wsptr[4] = (tmp10 - tmp11) << PASS1_BITS;
    
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    wsptr[2] = DESCALE( z1 + tmp13 * FIX_0_765366865, CONST_BITS-PASS1_BITS);
    wsptr[6] = DESCALE( z1 - tmp12 * FIX_1_847759065, CONST_BITS-PASS1_BITS);
    
    /* Odd part */
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602;
    
    tmp4 = tmp4 * FIX_0_298631336;
    tmp5 = tmp5 * FIX_2_053119869;
    tmp6 = tmp6 * FIX_3_072711026;
    tmp7 = tmp7 * FIX_1_501321110;
    z1 = z1 * - FIX_0_899976223;
    z2 = z2 * - FIX_2_562915447;
    z3 = z3 * - FIX_1_961570560;
    z4 = z4 * - FIX_0_390180644;
    
    z3 += z5;
    z4 += z5;

    wsptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
    wsptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
    wsptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
    wsptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);

    wsptr += 8;		/* advance pointer to next row */
  } }

  /* Pass 2: process columns. */
  {
  register int *wsptr = wsint;
  int	nq = 8;
  while (nq--) {
    tmp0 = wsptr[0] + wsptr[8*7];
    tmp7 = wsptr[0] - wsptr[8*7];
    tmp1 = wsptr[8] + wsptr[8*6];
    tmp6 = wsptr[8] - wsptr[8*6];
    tmp2 = wsptr[8*2] + wsptr[8*5];
    tmp5 = wsptr[8*2] - wsptr[8*5];
    tmp3 = wsptr[8*3] + wsptr[8*4];
    tmp4 = wsptr[8*3] - wsptr[8*4];
    
    /* Even part */
    
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    wsptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
    wsptr[8*4] = DESCALE(tmp10 - tmp11, PASS1_BITS);
    
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    wsptr[8*2] = DESCALE( z1 + tmp13 * FIX_0_765366865, CONST_BITS+PASS1_BITS);
    wsptr[8*6] = DESCALE( z1 - tmp12 * FIX_1_847759065, CONST_BITS+PASS1_BITS);
    
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602;
    
    tmp4 = tmp4 * FIX_0_298631336;
    tmp5 = tmp5 * FIX_2_053119869;
    tmp6 = tmp6 * FIX_3_072711026;
    tmp7 = tmp7 * FIX_1_501321110;
    z1 = z1 *  - FIX_0_899976223;
    z2 = z2 *  - FIX_2_562915447;
    z3 = z3 *  - FIX_1_961570560;
    z4 = z4 *  - FIX_0_390180644;
    
    z3 += z5;
    z4 += z5;
    
    wsptr[8*7] =  DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS);
    wsptr[8*5] =  DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS);
    wsptr[8*3] =  DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS);
    wsptr[8*1] =  DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS);

    wsptr++;			/* advance pointer to next column */
  } }

 /* quantize and re-order */
 {
  register int *wsptr;
  register short *dctptr;
  int	qval, temp;
  wsptr = wsint;	dctptr = dctstart;
  for (i=0; i < 64; i++) {
  /* this coded similar to the jpeg-5a code but with the zero checks */
  /* a lot done for rounding */
  qval = intqtb[i];
  temp = wsptr[zag[i]];
  if (temp < 0) {
	  temp = -temp;
	  temp += qval>>1;	/* for rounding */
	  temp = temp/qval;
	  temp = -temp;
  } else {
	  temp += qval>>1;	/* for rounding */
	  temp = temp/qval;
  }

  *dctptr++ = (short) temp;
  }
 }
 }
 /*---------------------------------------------------------------------------*/
void exp_dct_cell(start, ystride, dctstart)
 /* does dct and quant. for one cell */
 short	*start, *dctstart;
 int	ystride;
 {
 int i, j, n;
 short	int_bias = 2048; /* note this is I*2 in this routine */
 int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 int tmp10, tmp11, tmp12, tmp13;
 int z1, z2, z3, z4, z5;
#define FIX_0_298631336  ((int)  2446)
#define FIX_0_390180644  ((int)  3196)
#define FIX_0_541196100  ((int)  4433)
#define FIX_0_765366865  ((int)  6270)
#define FIX_0_899976223  ((int)  7373)
#define FIX_1_175875602  ((int)  9633)
#define FIX_1_501321110  ((int)  12299)
#define FIX_1_847759065  ((int)  15137)
#define FIX_1_961570560  ((int)  16069)
#define FIX_2_053119869  ((int)  16819)
#define FIX_2_562915447  ((int)  20995)
#define FIX_3_072711026  ((int)  25172)
#define CONST_BITS  13
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
 /* some of the jpeg-5a macros are handy, note SHIFT_TEMPS is blank */
#define SHIFT_TEMPS
#define RIGHT_SHIFT(x,shft)	((x) >> (shft))
#define ONE	((int) 1)
#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
 /* this is experimental version, uses some I*2 rather than all I*4 */
 /* remove bias and stash in short workspace array wshort */
 {
  /* no problem doing this in I*2 */
  register short *wsptr;
  register short *elemptr;
  wsptr = wshort;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   elemptr += n;
 } }
 /* now the actual dct */
  /* Pass 1: process rows. */
  {
  /* do as many as possible in I*2, results get stored in I*2 for column pass */
  /* avoid 32x32 multiplies, use 16x16 to 32 (sort of emulated here by
  doing (int)'s on I*2's before the multiply */
  register short *wsptr = wshort;
  register int *wsptr2 = wsint;
  short tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  short tmp10, tmp11, tmp12, tmp13, y1, y2, y3, y4;
  /* note that the z temps are I*4 while the tmp's and y's are I*2 */
  int z1, z2, z3, z4, z5;
  int	nq = 8;
  while (nq--) {
    tmp0 = wsptr[0] + wsptr[7];	/* need 13 bits here */
    tmp7 = wsptr[0] - wsptr[7];
    tmp1 = wsptr[1] + wsptr[6];
    tmp6 = wsptr[1] - wsptr[6];
    tmp2 = wsptr[2] + wsptr[5];
    tmp5 = wsptr[2] - wsptr[5];
    tmp3 = wsptr[3] + wsptr[4];
    tmp4 = wsptr[3] - wsptr[4];
    /* Even part */
    
    tmp10 = tmp0 + tmp3; /* up to 14 bits */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    /* we can only shift by 1 bit in each coeff. if the constants use 13,
    or overflows in 32 bit regs, this may lose a little */
    /* these 2 could are stored as 16 bits */
    wsptr[0] =  ((tmp10 + tmp11)) << PASS1_BITS;
    wsptr[4] =  ((tmp10 - tmp11)) << PASS1_BITS;
    
    /* a 16 add and then a 16x16->32, but here we have to convert first */
    z1 = (int) (tmp12 + tmp13) * FIX_0_541196100;
    /* keep z1 in accum, and add products below */
    wsptr[2] = (short) DESCALE( z1 + (int) tmp13 * FIX_0_765366865,
    					CONST_BITS-PASS1_BITS);
    wsptr[6] = (short) DESCALE( z1 - (int) tmp12 * FIX_1_847759065,
    					CONST_BITS-PASS1_BITS);
    
    /* Odd part */
    y1 = tmp4 + tmp7;	/* more 16 bit adds, use 14 bits */
    y2 = tmp5 + tmp6;
    y3 = tmp4 + tmp6;
    y4 = tmp5 + tmp7;
    z5 = (int) ((y3 + y4)) * FIX_1_175875602;
    
    z1 = (int) y1 * - FIX_0_899976223;
    z2 = (int) y2 * - FIX_2_562915447;
    z3 = (int) y3 * - FIX_1_961570560;
    z4 = (int) y4 * - FIX_0_390180644;
    
    z3 += z5;
    z4 += z5;

    wsptr[7] = (short) DESCALE((int) tmp4 * FIX_0_298631336 + z1 + z3,
    			CONST_BITS-PASS1_BITS);
    wsptr[5] = (short) DESCALE((int) tmp5 * FIX_2_053119869 + z2 + z4,
    			CONST_BITS-PASS1_BITS);
    wsptr[3] = (short) DESCALE((int) tmp6 * FIX_3_072711026 + z2 + z3,
    			CONST_BITS-PASS1_BITS);
    wsptr[1] = (short) DESCALE((int) tmp7 * FIX_1_501321110 + z1 + z4,
    			CONST_BITS-PASS1_BITS);

    wsptr2 += 8;
    wsptr +=  8;		/* advance pointer to next row */
  } }

  /* Pass 2: process columns. */
  {
  register short *wsptr = wshort;
  /* register int *wsptr2 = wsint; */
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  int	nq = 8;
  while (nq--) {
    tmp0 = (int) wsptr[0] + (int) wsptr[8*7];
    tmp7 = (int) wsptr[0] - (int) wsptr[8*7];
    tmp1 = (int) wsptr[8] + (int) wsptr[8*6];
    tmp6 = (int) wsptr[8] - (int) wsptr[8*6];
    tmp2 = (int) wsptr[8*2] + (int) wsptr[8*5];
    tmp5 = (int) wsptr[8*2] - (int) wsptr[8*5];
    tmp3 = (int) wsptr[8*3] + (int) wsptr[8*4];
    tmp4 = (int) wsptr[8*3] - (int) wsptr[8*4];
    
    /* Even part */
    
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    /* to store in a 16 bit result, we do the x8 descale here instead of
    multiplying the q table by 8 */
    wsptr[0]   = DESCALE(tmp10 + tmp11, PASS1_BITS+3);
    wsptr[8*4] = DESCALE(tmp10 - tmp11, PASS1_BITS+3);
    
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    wsptr[8*2] = DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS+PASS1_BITS+3);
    wsptr[8*6] = DESCALE(z1 - tmp12 * FIX_1_847759065, CONST_BITS+PASS1_BITS+3);
    
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602;
    
    tmp4 = tmp4 * FIX_0_298631336;
    tmp5 = tmp5 * FIX_2_053119869;
    tmp6 = tmp6 * FIX_3_072711026;
    tmp7 = tmp7 * FIX_1_501321110;
    z1 = z1 *  - FIX_0_899976223;
    z2 = z2 *  - FIX_2_562915447;
    z3 = z3 *  - FIX_1_961570560;
    z4 = z4 *  - FIX_0_390180644;
    
    z3 += z5;
    z4 += z5;
    
    wsptr[8*7] =  DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS+3);
    wsptr[8*5] =  DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS+3);
    wsptr[8*3] =  DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS+3);
    wsptr[8*1] =  DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS+3);

    wsptr++;			/* advance pointer to next column */
  } }

 /* quantize and re-order */
 {
  register short *wsptr;
  register short *dctptr;
  int	qval, temp;
  wsptr = wshort;	dctptr = dctstart;
  for (i=0; i < 64; i++) {
  /* this coded similar to the jpeg-5a code but without the zero checks */
  /* a lot done for rounding */
  qval = intqtb[i];
  temp = (int) wsptr[zag[i]];
  if (temp < 0) {
	  temp = -temp;
	  temp += qval>>1;	/* for rounding */
	  temp = temp/qval;
	  temp = -temp;
  } else {
	  temp += qval>>1;	/* for rounding */
	  temp = temp/qval;
  }

  *dctptr++ = (short) temp;
  }
 }
 }
void exp2_dct_cell(start, ystride, dctstart, test_flag)
 /* does dct and quant. for one cell */
 /* simple 1-D DCT's done as sum, sort of emulating the 16 bit operands
 and 48 bit accumulator in the IP */
 short	*start, *dctstart;
 int	ystride, test_flag;
 {
 int i, j, n;
 short	int_bias = 2048; /* note this is I*2 in this routine */
 /* remove bias and stash in short workspace array wshort */
 {
  /* no problem doing this in I*2 */
  register short *wsptr;
  register short *elemptr;
  wsptr = wshort;	elemptr = start;	n = ystride - 8;
  for (i=0; i < 8; i++) {
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   *wsptr++ =  *elemptr++ - int_bias;
   elemptr += n;
 } }
 /* now the actual dct, use pre-generated weights, 8x8 array although we
 don't actually use the 0 row because it is constant, note
 that we take elements from wshort and put them in wshort2, row
 4 is constant except for sign, we just use it so that it looks
 like all the others */
  /* Pass 1: process rows. */
  {
  int	nq = 8;
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw;
  int	acc;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    
    /* dc term, could use weights here and then it would look like
    the others; instead, just add and shift left by 1 */
    p = wsptr;	n = 8;	acc = 0;
    while(n--) acc += (int) *p++;	wsptr2[0] = (short) (acc << 1);
    
    /* start weight pointer at 8 because dc term was done different */
    p = wsptr;	n = 8;	acc = 2048;  pw = &dctw[8];
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[1*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[2*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[3*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[4*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[5*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[6*8] = (short) (acc >> 12);
    
    p = wsptr;	n = 8;	acc = 2048;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr2[7*8] = (short) (acc >> 12);
  
    wsptr2++;
    wsptr +=  8;		/* advance pointer to next row */
  } }
 /* output these results for a test if test_flag set */
 if (test_flag)
 {
  register short *wsptr2 = wshort2;
  register short *dctptr = dctstart;
  for (i=0; i < 64; i++) {
  *dctptr++ = *wsptr2++;
  }
 return;
 }
  /* Pass 2: process columns. */
  {
  /* here we go from wshort2 to wshort */
  register short *wsptr = wshort;
  register short *wsptr2 = wshort2, *p, *pw;
  int	acc;
  int	nq = 8;
  while (nq--) {
    /* actually do 8 sets of 8 multiply/sums for each result */
    /* almost the same as row pass, except for buffers switched and
    a different shift amount to get final results */
    p = wsptr2;	n = 8;	acc = 8;
    while(n--) acc += (int) *p++;	wsptr[0] = (short) (acc >> 4);
    
    p = wsptr2;	n = 8;	acc = 65536;  pw = &dctw[8];
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[1*8] = (short) (acc >> 17);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[2*8] = (short) (acc >> 17);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[3*8] = (short) (acc >> 17);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[4*8] = (short) (acc >> 17);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[5*8] = (short) (acc >> 17);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[6*8] = (short) (acc >> 17);

    p = wsptr2;	n = 8;	acc = 65536;
    while(n--) acc += (int)*p++ * (int)*pw++;
    wsptr[7*8] = (short) (acc >> 17);

    wsptr2 += 8;
    wsptr++;			/* advance pointer to next column */
  } }

 /* quantize and re-order */
 {
  register short *wsptr;
  register short *dctptr;
  int	qval, temp;
  wsptr = wshort;	dctptr = dctstart;
  for (i=0; i < 64; i++) {
  /* this coded similar to the jpeg-5a code but without the zero checks */
  /* the coeffs are put into zig/zag order */
  /* a lot done for rounding */
  qval = intqtb[i];
  temp = (int) wsptr[zag[i]];
  if (temp < 0) {
	  temp = -temp;
	  temp += qval>>1;	/* for rounding */
	  temp = temp/qval;
	  temp = -temp;
  } else {
	  temp += qval>>1;	/* for rounding */
	  temp = temp/qval;
  }

  *dctptr++ = (short) temp;
  }
 }
 return;
 }
 /*---------------------------------------------------------------------------*/
void rdct_cell(start, ystride, dctstart)
 /* does reverse quant. and dct for one cell */
 short	*start;
 float	*dctstart;
 int	ystride;
 {
 int i, j, n;
 float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 float tmp10, tmp11, tmp12, tmp13;
 float z1, z2, z3, z4, z5, z11, z13, z10, z12;

 /* first de-zag and de-quantize */
 { register float *wsptr, *dctptr;
  wsptr = ws;	dctptr = dctstart;
  for (i=0; i < 64; i++) {
  wsptr[zag[i]] = *dctptr++ * fqtbl[i];
  }
 }
  /* Pass 1: process columns. */
  /* we don't check for columns of zeroes since this usually uses full
  precision */
  {
  register float *wsptr = ws;
  register float *fqtptr = fqtbl;
  int	nq = 8;
  while (nq--) {
    tmp0 = wsptr[0];
    tmp1 = wsptr[8*2];
    tmp2 = wsptr[8*4];
    tmp3 = wsptr[8*6];

    tmp10 = tmp0 + tmp2;	/* phase 3 */
    tmp11 = tmp0 - tmp2;

    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
    tmp12 = (tmp1 - tmp3) *  1.414213562 - tmp13; /* 2*c4 */

    tmp0 = tmp10 + tmp13;	/* phase 2 */
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;
    
    /* Odd part */

    tmp4 = wsptr[8];
    tmp5 = wsptr[8*3];
    tmp6 = wsptr[8*5];
    tmp7 = wsptr[8*7];

    z13 = tmp6 + tmp5;		/* phase 6 */
    z10 = tmp6 - tmp5;
    z11 = tmp4 + tmp7;
    z12 = tmp4 - tmp7;

    tmp7 = z11 + z13;		/* phase 5 */
    tmp11 = (z11 - z13) * ( 1.414213562); /* 2*c4 */

    z5 = (z10 + z12) * ( 1.847759065); /* 2*c2 */
    tmp10 = ( 1.082392200) * z12 - z5; /* 2*(c2-c6) */
    tmp12 = ( -2.613125930) * z10 + z5; /* -2*(c2+c6) */

    tmp6 = tmp12 - tmp7;	/* phase 2 */
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    wsptr[0]   = tmp0 + tmp7;
    wsptr[8*7] = tmp0 - tmp7;
    wsptr[8]   = tmp1 + tmp6;
    wsptr[8*6] = tmp1 - tmp6;
    wsptr[8*2] = tmp2 + tmp5;
    wsptr[8*5] = tmp2 - tmp5;
    wsptr[8*4] = tmp3 + tmp4;
    wsptr[8*3] = tmp3 - tmp4;

    fqtptr++;
    wsptr++;		/* advance pointers to next column */
  } }

  /* Pass 2: process rows. */
  {
  register float *wsptr;
  register short *elemptr;
  int	nq = 8;
  wsptr = ws;	elemptr = start;
  while (nq--) {
      /* Even part */

    tmp10 = wsptr[0] + wsptr[4] + bias;
    tmp11 = wsptr[0] - wsptr[4] + bias;

    tmp13 = wsptr[2] + wsptr[6];
    tmp12 = (wsptr[2] - wsptr[6]) * ( 1.414213562) - tmp13;

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    /* Odd part */

    z13 = wsptr[5] + wsptr[3];
    z10 = wsptr[5] - wsptr[3];
    z11 = wsptr[1] + wsptr[7];
    z12 = wsptr[1] - wsptr[7];

    tmp7 = z11 + z13;
    tmp11 = (z11 - z13) * ( 1.414213562);

    z5 = (z10 + z12) * ( 1.847759065); /* 2*c2 */
    tmp10 = ( 1.082392200) * z12 - z5; /* 2*(c2-c6) */
    tmp12 = ( -2.613125930) * z10 + z5; /* -2*(c2+c6) */

    tmp6 = tmp12 - tmp7;
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    /* Final output stage, note bias was added in above */
    /* we don't range limit since results should be near exact */
    elemptr[0] = (short) (tmp0 + tmp7);
    elemptr[7] = (short) (tmp0 - tmp7);
    elemptr[1] = (short) (tmp1 + tmp6);
    elemptr[6] = (short) (tmp1 - tmp6);
    elemptr[2] = (short) (tmp2 + tmp5);
    elemptr[5] = (short) (tmp2 - tmp5);
    elemptr[4] = (short) (tmp3 + tmp4);
    elemptr[3] = (short) (tmp3 - tmp4);

    wsptr += 8;
    elemptr += ystride;		/* to next row */
  } }
 }
 /*---------------------------------------------------------------------------*/
int ana_trace_fpdct(narg,ps)		/* function for fp dct tables */
 /* dct_array = ana_trace_fpdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 1 for fp dct */
 {  return trace_dct(narg,ps, 1); }
 /*---------------------------------------------------------------------------*/
int ana_trace_intdct(narg,ps)		/* function for int dct tables */
 /* dct_array = ana_trace_fpdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 2 for int dct */
 {  return trace_dct(narg,ps, 2); }
 /*---------------------------------------------------------------------------*/
int ana_trace_expdct(narg,ps)		/* function for exp dct tables */
 /* dct_array = ana_trace_expdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 3 for int dct */
 {  return trace_dct(narg,ps, 3); }
 /*---------------------------------------------------------------------------*/
int ana_trace_expdct2(narg,ps)		/* function for exp dct tables */
 /* dct_array = ana_trace_expdct( image, qtable) */
 int	narg, ps[];
 /* calls trace_dct with mode = 4 for int dct */
 {  return trace_dct(narg,ps, 4); }
 /*---------------------------------------------------------------------------*/
int trace_dct(narg,ps,mode)		/* internal routine for dct tables */
 /* mode = 1 for fp dct, 2 for I*2 dct, 3 for experimental version */
 int	narg, ps[];
 {
 short	*image, *qtable, *dct_intarray;
 float	*dct_array;
 int	nd, nx, ny, i, j, n, dim[8], iq, result_sym, test_flag=0;
 struct	ahead	*h;
 /* first arg must be a 2-D I*2 array */
 iq = ps[0];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  return execute_error(111);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 image = (short *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 if (nd != 2) return execute_error(101);
 nx = h->dims[0]; ny = h->dims[1];
 if ( nx%8 != 0 || nx%8 != 0) {
  printf("image dimensions must be a multiple of 8, nx, ny = %d %d\n",nx,ny);
  return 0;
 }
 /* second arg must also be a I*2 array of length 64  */
 iq = ps[1];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  return execute_error(111);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 qtable = (short *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; /*# of elements for nsym */
 if ( n != 64) {
 	printf("quantization table must be 64 long, n = %d\n", n);
	return -1;
 }
 /* an optional 3rd argument can be a test flag */
 if (narg>2) if (int_arg_stat(ps[2], &test_flag) != 1) return -1;

 /* we return an array with all the dct's, (64, nx*ny/64) */
 dim[0] = 64;	dim[1] = nx*ny/64;
 switch (mode) {
 case 1:
 result_sym = array_scratch(3, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_array = (float *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_fp(image, nx, ny, qtable, dct_array);
 break;
 case 2:
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_intarray = (short *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_int(image, nx, ny, qtable, dct_intarray);
 break;
 case 3:
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_intarray = (short *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_exp(image, nx, ny, qtable, dct_intarray);
 break;
 case 4:
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_intarray = (short *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_exp2(image, nx, ny, qtable, dct_intarray,test_flag);
 break;
/* #ifdef __alpha */
 case 5:
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_intarray = (short *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_exp3(image, nx, ny, qtable, dct_intarray,test_flag);
 break;
 case 6:
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_intarray = (short *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_exp4(image, nx, ny, qtable, dct_intarray,test_flag);
 break;
 case 7:
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct_intarray = (short *) ((char *)h + sizeof(struct ahead));
 /* call the routine that does the work */
 fdct_exp5(image, nx, ny, qtable, dct_intarray,test_flag);
 break;
/* #endif */
 }
 return result_sym;
 }						/*end of ana_trace_fpdct */
 /*------------------------------------------------------------------------- */
int ana_trace_rdct(narg,ps)	/* function for restoring image from dct */
 /* image = ana_trace_rdct(dct, nx, qtable) */
 int	narg, ps[];
 {
 short	*image, *qtable;
 float	*dct_array;
 int	nd, nx, ny, i, j, n, dim[8], iq, result_sym;
 struct	ahead	*h;
 /* first arg must be a F*4 array */
 iq = ps[0];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 3 )  iq = ana_float(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dct_array = (float *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; /*# of elements for nsym */
 /* second arg is number of columns in result array */
 if (int_arg_stat(ps[1], &nx) != 1) return -1;
 /* some restrictions, n%64 must = 0, and (n/nx)%8 must = 0 */
 if ( n%64 != 0 || (n/nx)%8 != 0) {
  printf("dct dimensions invalid, n, nx = %d %d\n",n, nx);
  return 0;
 }
 ny = n/nx;
 /* third arg must be a I*2 array of length 64  */
 iq = ps[2];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  return execute_error(111);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 qtable = (short *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; /*# of elements for nsym */
 if ( n != 64) {
 	printf("quantization table must be 64 long, n = %d\n", n);
	return -1;
 }
 /* we return a short array (nx, ny) */
 dim[0] = nx;	dim[1] = ny;
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 image = (short*) ((char *)h + sizeof(struct ahead));
 ana_zero(1, &result_sym);
 /* call the routine that does the work */
 rdct_fp(image, nx, ny, qtable, dct_array);
 return result_sym;
 }						/*end of ana_trace_fpdct */
 /*------------------------------------------------------------------------- */
int ana_huff_table_vert(narg,ps)	/* test routine for tables */
 int	narg, ps[];
 {
 /* expect 4 args, first 2 input */
 /* huff_table_vert, bits, values, ehufco, ehufsi */
 int	nd, nx, ny, dim[8], iq, maxval;
 short	*ehufco, *ehufsi;
 struct	ahead	*h;
 byte	*bits, *huffval, *p;
 /* first arg must be a byte array of length 16 */
 /*printf("in ana_huff_table_vert\n");*/
 iq = ps[0];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 bits = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
 	printf("HUFF_TABLE_VERT, bits must be a vector of 16\n");
	return -1; }
 /* now the values, the length of this determines the sizes of the outputs */
 iq = ps[1];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 huffval = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1) {
 	printf("HUFF_TABLE_VERT, values must be a vector\n");
	return -1; }
 /* create the output arrays, since these are indexed by value, we need to
 find the max value to get sizes */
 maxval = 0;  iq = nx;  p = huffval;
 while(iq--) { maxval = MAX(maxval, (int) *p);	p++; }
 maxval++;
 /* printf("maxval in huff_table_vert = %d\n", maxval); */
 nd = 1;
 dim[0] = maxval;
 if (redef_array(ps[2], 1, nd, dim) != 1 || redef_array(ps[3], 1, nd, dim) != 1)
	 return -1;
 h = (struct ahead *) sym[ps[2]].spec.array.ptr;
 ehufco = (short *) ((char *)h + sizeof(struct ahead));
 bzero(  (char *) ehufco, maxval * sizeof(short) );
 h = (struct ahead *) sym[ps[3]].spec.array.ptr;
 ehufsi = (short *) ((char *)h + sizeof(struct ahead));
 bzero(  (char *) ehufsi, maxval * sizeof(short) );
 /* make the useful tables */
 iq = huff_table_vert(bits, huffval, ehufco, ehufsi, nx);
 /* printf("return status from huff_table_vert = %d\n", iq); */
 if (iq != 1) printf("error in huff_table_vert\n");
 return iq;
 }
 /*------------------------------------------------------------------------- */
int ana_huff_table_gen(narg,ps)		/* test routine for tables */
 int	narg, ps[];
 /* calls huff_table_gen(freqin, nf, bits, huffval, nval) */
 {
 /* expect 3 args, first is input, others are computed here */
 int	nd, nx, ny, dim[8], iq, *freq, nval, j;
 struct	ahead	*h, *h2;
 byte	*bits, *huffval;
 /* first arg is a histogram (or frequency distribution) */
 /* we make it an I*4 array */
 iq = ps[0];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 2 )  iq = ana_long(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 freq = (int *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = 1; for (j=0;j<nd;j++) nx *= h->dims[j]; 	/* # of elements */
 /* accept multi-dimensional histograms (16x16 common) but output vector
 array of same (or smaller) size */
 if (nx > 256) {
 	printf("HUFF_TABLE_GEN, histogram must be <= 256 long\n");
	return -1; }
 /* create the output arrays */
 /* redefine the second arg to be a byte array 32 long */
 iq = ps[1];
 dim[0] = 32;
 nd = 1;
 if (redef_array(iq, 0, nd, dim) != 1) return -1;
 h2 = (struct ahead *) sym[iq].spec.array.ptr;
 bits = (byte *) ((char *)h2 + sizeof(struct ahead));
 iq = ps[2];
 dim[0] = nx;
 nd = 1;
 if (redef_array(iq, 0, nd, dim) != 1) return -1;
 h = (struct ahead *) sym[iq].spec.array.ptr;
 huffval = (byte *) ((char *)h + sizeof(struct ahead));

 /* done with setup, call huff_table_gen to do the work */
 iq = huff_table_gen(freq, nx, bits, huffval, &nval);
 if (iq != 1) return iq;
 /* reduce the size of bits to 16 via buggering the dimension */
 h2->dims[0] = 16;
 /* now bugger the dimension of huffval if nval is smaller than nx */
 /* note that we still have the header for huffval in h */
 if (nval < nx) h->dims[0] = nval;
 return 1;
 }
 /*------------------------------------------------------------------------- */
int huff_table_gen(freqin, nf, bits, huffval, nval)
 unsigned char bits[], huffval[];
 int freqin[], *nval, nf;
 /* nf (<=256) should be the size of freqin; bits must be of length 32,
 the freqin counts are assumed to represent values starting
 with 0 and incrementing by 1, huffval will be a sorted set of values
 that had non-zero frequencies, the length will be output in nval and
 will be <= nf */

 {
 int	freq[257], codesize[257], others[257], *pq, *pf;
 int c1, c2;
 int i, j, v, n;
 byte	*p;
 /* this code adapted from the Independent JPEG code */
 n = 32;	p = bits;	while (n--) *p++ = 0;	/* zero bits */
 n = nf+1;	pq = codesize;	while (n--) *pq++ = 0;	/* zero codesize */
 n = nf+1;	pq = others;	while (n--) *pq++ = -1;	/* init others */
 /* make a local copy of freqin and add a ringer to the end to avoid a symbol
 with all 1's */
 n = nf;	pq = freq;	pf = freqin;	
 while (n--) *pq++ = *pf++;
 *pq = 1;	/* this is the ringer */
 
 for (;;) {
    /* Find the smallest nonzero frequency, set c1 = its symbol */
    /* In case of ties, take the larger symbol number */
    c1 = -1;
    v = 1000000000;
    for (i = 0; i <= nf; i++) {
      if (freq[i] && freq[i] <= v) {
	v = freq[i];
	c1 = i;
      }
    }

    /* Find the next smallest nonzero frequency, set c2 = its symbol */
    /* In case of ties, take the larger symbol number */
    c2 = -1;
    v = 1000000000;
    for (i = 0; i <= nf; i++) {
      if (freq[i] && freq[i] <= v && i != c1) {
	v = freq[i];
	c2 = i;
      }
    }

    /* Done if we've merged everything into one frequency */
    if (c2 < 0)
      break;
    
    /* Else merge the two counts/trees */
    freq[c1] += freq[c2];
    freq[c2] = 0;

    /* Increment the codesize of everything in c1's tree branch */
    codesize[c1]++;
    while (others[c1] >= 0) {
      c1 = others[c1];
      codesize[c1]++;
    }
    
    others[c1] = c2;		/* chain c2 onto c1's tree branch */
    
    /* Increment the codesize of everything in c2's tree branch */
    codesize[c2]++;
    while (others[c2] >= 0) {
      c2 = others[c2];
      codesize[c2]++;
    }
  }

 /* Now count the number of symbols of each code length */
 for (i = 0; i <= nf; i++) {
    if (codesize[i]) {
      /* The JPEG standard seems to think that this can't happen, */
      /* but I'm paranoid... */
      if (codesize[i] > 32) {
	printf("huff_table_gen, huffman code > 32 bits long! (%d)\n",
		codesize[i]);
		return -1; }
      bits[codesize[i]-1]++;
    }
  }
  /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
   * Huffman procedure assigned any such lengths, we must adjust the coding.
   * Here is what the JPEG spec says about how this next bit works:
   * Since symbols are paired for the longest Huffman code, the symbols are
   * removed from this length category two at a time.  The prefix for the pair
   * (which is one bit shorter) is allocated to one of the pair; then,
   * skipping the BITS entry for that prefix length, a code word from the next
   * shortest nonzero BITS entry is converted into a prefix for two code words
   * one bit longer.
   */
  
  for (i = 31; i > 15; i--) {
    while (bits[i] > 0) {
      /* printf("caught one at i = %d, bits[i] = %d\n", i, bits[i]); */
      j = i - 2;		/* find length of new prefix to be used */
      while (bits[j] == 0)
	j--;
      
      bits[i] -= 2;		/* remove two symbols */
      bits[i-1]++;		/* one goes in this length */
      bits[j+1] += 2;		/* two new symbols in this length */
      bits[j]--;		/* symbol of this length is now a prefix */
    }
  }

  /* Remove the count for the pseudo-symbol 256 from the largest codelength */
  while (bits[i] == 0)		/* find largest codelength still in use */
    i--;
  bits[i]--;

 /* Return final symbol counts (only for lengths 0..16) */
  
  /* Return a list of the symbols sorted by code length */
  /* It's not real clear to me why we don't need to consider the codelength
   * changes made above, but the JPEG spec seems to think this works.
   */
  p = huffval;
  for (i = 1; i <= 32; i++) {
    for (j = 0; j <= nf -1; j++) if (codesize[j] == i) *p++ = (byte) j;
  }
  *nval = p - huffval;		/* return number of meaningful values */
  return 1;
 }
 /*------------------------------------------------------------------------- */
/* EMITCODE macros */
 /* code that inserts up to 16 bits in the output buffer, used for
 a single Huffman code in 2 places, the "inputs" are
 code and sq */
#define EMITCODE \
 i=r1>>3;	j=r1%8;		/* our byte and bit addresses */	\
 px = x + i;								\
 y = (int) (code & mask[sq]);						\
 r1 = r1 + sq;		/* future r1 (after this entry) */		\
 if ( r1 > limit ) return dct_buffer_err(limit, size);			\
 if ( j == 0 ) {	/* this is the lucky case, 1 in 8, less work */	\
   /* check for spillover to next bytes (only 1 in this case)*/		\
   /* messy but works on little or big endians */			\
   ks = sq - 8;								\
   if ( ks > 0 ) { *px++ = (byte) (y >> ks);  ks = ks - 8; }		\
   *px = (byte) (y << (-ks) );						\
 } else {		/* more common, need to shift before merging */	\
   ks = sq + j - 8;							\
   if ( ks > 0 ) { *px |= (y >> ks); ks = ks - 8; px++;			\
     if ( ks > 0 ) { *px++ = (byte) ( y >> ks); ks = ks - 8; }		\
   *px = (byte) (y << (-ks) );						\
   } else *px |= (byte) (y << (-ks) );					\
 }
 /* code  that inserts up to 31 bits in the output buffer, used for
 the dc and ac coefficients, the "inputs" are code, sq, nbits, and temp2 */
#define EMITCODE2 \
 i=r1>>3;	j=r1%8;		/* our byte and bit addresses */	\
 px = x + i;								\
 z = (int) (temp2 & mask[nbits]);					\
 y = (int) (code & mask[sq]);						\
 y = y << nbits;							\
 y = y | z;				/* entire dc or ac result */	\
 size = sq + nbits;							\
 r1 = r1 + size;		/* future r1 (after this entry) */	\
 if ( r1 > limit ) return dct_buffer_err(limit, size);			\
									\
 if ( j == 0 ) {	/* this is the lucky case, 1 in 8, less work */	\
   /* check for spillover to next bytes */				\
   /* messy but works on little or big endians */			\
   ks = size - 8;							\
   if ( ks > 0 ) { *px++ = (byte) (y >> ks);  ks = ks - 8;		\
     if ( ks > 0 ) { *px++ = (byte) (y >> ks);  ks = ks - 8;		\
       if ( ks > 0 ) { *px++ = (byte) (y >> ks);  ks = ks - 8;}}}	\
   *px = (byte) (y << (-ks) );						\
 } else {		/* more common, need to shift before merging */	\
   /* up to 5 bytes (including the portion of a previous one) */	\
   ks = size + j - 8;							\
   if ( ks > 0 ) { *px |= (y >> ks); ks = ks - 8; px++;			\
     if ( ks > 0 ) { *px++ = (byte) ( y >> ks); ks = ks - 8;		\
       if ( ks > 0 ) { *px++ = (byte) ( y >> ks); ks = ks - 8;		\
	 if ( ks > 0 ) { *px++ = (byte) (y >> ks);  ks = ks - 8;}}}	\
     *px = (byte) (y << (-ks) );					\
   } else *px |= (byte) (y << (-ks) );					\
 }
 /*------------------------------------------------------------------------- */
int dct_buffer_err(limit,size)
 int limit, size;
 {
 printf("Huffman compression exceeded buffer space, limit =%d\n", limit);
 return -1;
 }
 /*------------------------------------------------------------------------- */
int dct_buffer_decode_err(limit)
 int limit;
 {
 printf("Exceeded data size during Huffman decompression , limit =%d\n", limit);
 return -1;
 }
 /*------------------------------------------------------------------------- */
int ana_huff_encode_dct(narg,ps)	/* test routine for encoding dct's */
 /* this is an ana function, it returns the # of bytes required for the
 compression */
 int	narg, ps[];
 /* calls huff_encode_dct */
 {
 /* expect 6 args
 n = huff_encode_dct(dct, buffer, dc_ehufco, dc_ehufsi, ac_ehufco, ac_ehufsi)*/

 int	nd, nx, ny, dim[8], iq, *freq, nval, n, lenx, i, j, limit, nblocks;
 int	result_sym, blimit;
 struct	ahead	*h;
 unsigned short *dc_ehufco, *dc_ehufsi, *ac_ehufco, *ac_ehufsi;
 short last_dc = 0, *dct;
 byte	*xbuffer, *tbuffer;
 byte *px, *pt;
 /* first arg is the dct array, usually a (64,nblocks) array but
 we accept any thing that is a multiple of 64 */
 /* we make it an I*2 array */
 iq = ps[0];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  iq = ana_word(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dct = (short *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; 	/* # of elements */
 if ( (n%64) != 0 || n == 0) {
 	printf("HUFF_ENCODE_DCT, dct array must be 0 mod 64\n");
	return -1; }
 nblocks = n/64;
 /* the second arg is a buffer to hold the result, don't worry about data type,
 but must be an array. We figure out total bytes available for limit */
 /* this will be the buffer for the result but we also use another buffer
 to hold the result before checking for accidental markers (0xff's), the
 marker removal is done by replacing each ff with an ff followed by a zero
 which causes a slight expansion of the data. This could be implemented
 by checking within the Huffman coding but we allocate a second buffer and
 do it afterwards, transferring every byte from a temp. buffer to the final
 one. The temp buffer is freed before any other mallocs so it shouldn't
 cause frag. problems at least. */
 iq = ps[1];
 if ( sym[iq].class != 4 ) return execute_error(66);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 limit = ana_type_size[sym[iq].type];
 for (j=0;j<nd;j++) limit *= h->dims[j]; 		/* # of bytes */
 xbuffer = (byte *) ((char *)h + sizeof(struct ahead));
 blimit = limit * 8;	/* make a bit limit also */
 printf("limit/blimit = %d, %d\n", limit, blimit);
 /* now malloc a temp buffer of same size */
 tbuffer = (byte *) malloc(limit);
 bzero( tbuffer, limit);

 /* arg 3 is the dc Huffman code table, must be 16 long */
 iq = ps[2];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  iq = ana_word(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
   printf("HUFF_ENCODE_DCT, dc_ehufco array must be 16 long, it is %d\n",nx);
   return -1; }
 dc_ehufco = (unsigned short *) ((char *)h + sizeof(struct ahead));

 /* arg 4 is the dc Huffman length table, must also be 16 long */
 iq = ps[3];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
 	printf("HUFF_ENCODE_DCT, dc_ehufsi array must be 16 long\n");
	return -1; }
 dc_ehufsi = (unsigned short *) ((char *)h + sizeof(struct ahead));

 /* arg 5 is the ac Huffman code table, must be 256 long */
 iq = ps[4];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  iq = ana_word(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; /*# of elements for nsym */
 if (n != 256) {
 	printf("HUFF_ENCODE_DCT, ac_ehufco array must be 256 long\n");
	return -1; }
 ac_ehufco = (unsigned short *) ((char *)h + sizeof(struct ahead));

 /* arg 6 is the ac Huffman length table, must also be 256 long */
 iq = ps[5];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; /*# of elements for nsym */
 if (n != 256) {
 	printf("HUFF_ENCODE_DCT, ac_ehufsi array must be 256 long\n");
	return -1; }
 ac_ehufsi = (unsigned short *) ((char *)h + sizeof(struct ahead));

 /* at present, we always have last_dc set to 0 before this call */
 lenx = huff_encode_dct(dct, nblocks, tbuffer, blimit, dc_ehufco, dc_ehufsi,
 	ac_ehufco, ac_ehufsi, last_dc);
 if (lenx < 0) { free(tbuffer);  return lenx; }
 /* now transfer to xbuffer and zero pad any accidental markers */
 n = i = lenx;
 px = xbuffer;	pt = tbuffer;
 while (n--) {
 if ( (*px++ = *pt++) == 0xff ) { *px++ = 0; lenx++;
 	if (lenx > limit) { /* trouble */
	free(tbuffer);
	printf("huff_encode_dct, exceeded buffer limit during padding\n");
	return -1; }}
 }
 /* printf("padded %d markers\n", lenx - i); */
 free(tbuffer);
 /* put the byte count into a scalar and return with it */
 result_sym = scalar_scratch(2);
 sym[result_sym].spec.scalar.l = lenx;
 return result_sym;
 }
 /*------------------------------------------------------------------------- */
int huff_encode_dct(dct, nblocks, x, limit, dc_ehufco, dc_ehufsi, ac_ehufco,
	ac_ehufsi, last_dc)
 /* takes a 12 bit DCT array organized as a 64 x nblocks array and puts
 the Huffman encoded result in a buffer x. The 64 coefficients for each
 block are in zig/zag order. The total buffer size available is limit which
 is only occassionally checked so there should be enough margin for a
 worse case coefficient (about 32 bits?) to avoid possible clobbering.
 The Huffman codes and their lengths for the dc terms are in dc_ehufco
 and dc_ehufsi (this is JPEGese) and the ac codes/lengths are in
 ac_ehufco and ac_ehufsi. These are tables which are 16 long for the dc
 case and 256 for the ac (although 14 are not used for the ac). Tables
 for 8 bit JPEG would be a bit shorter. The last_dc is just 0 if this
 is an entire image, could be the previous dc value if this routine is
 used to encode just part of an image. An attempt has been made to use
 16 bit or 8 bit quantities (except for an accumulator) to make the
 relationship with an IP code more likely. But there are still many
 32 bit counters in the code at present. */
 /* returns # of bytes used in x */
 short	dct[], last_dc;		/* short is 16 bits */
 unsigned short dc_ehufco[], ac_ehufco[];
 byte	x[], dc_ehufsi[], ac_ehufsi[];
 int	limit, nblocks;		/* but could be short */
 {
 int	i, j, k, idct, r1, size, iq, lenx;
 int	ks;
 short	temp, temp2, nbits, rs, sq, code;
 unsigned int	y, z;
 unsigned short mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff,
  0x1ff,0x3ff,0x7ff,0xfff,0x1fff,0x3fff,0x7fff,0xffff};
 byte *px;
 px = x;	/* where we put the results */
 r1 = 0;	/* a bit count */
 
 while (nblocks--)	{	/* loop over input blocks */
 /* output the dc and ac components of a block */
 /* parts modified from the jpeg-5a code */
 /* the dct array contains the coeffs. in zig/zag order, we write
 to a buffer x[] which has to be big enough (we check), x is not
 pre-zeroed which accounts for some of the "j==0"  testing done */

 /* Encode the DC coefficient difference per section F.1.2.1 */
 /* this involves doing a first difference and then then finding the
 nagnitude range of the result (i.e., log2). There are 16 ranges which
 use a Huffman code tabe, the remaining bits (up to 15) then follow the
 Huffman code for a total of up to 31 bits for a dc term */
 
 temp = temp2 = dct[0] - last_dc;
 last_dc = dct[0];

 /* note that dct is bumped by 64 at end of loop */

 if (temp < 0) { temp = -temp;	temp2--; }

 /* temp is abs value of input */
 /* For a negative input, want temp2 = bitwise complement of abs(input) */
 /* This code assumes we are on a two's complement machine */
 /* Find the number of bits needed for the magnitude of the coefficient */
 nbits = 0;
 while (temp) { nbits++;  temp >>= 1; } /* this zaps temp, we use temp2 */

 /* the number of bits (nbits) will be 0 to 14 for the dc term with 12 JPEG,
 a Huffman code for nbits is inserted into the bit stream followed by
 temp2. The Huffman code can be up to 16 bits long and temp2 can be up to
 15 bits for a total of 31 bits. We could emit these out separately but
 instead combine them first in a 32 bit word. */
 
 sq = dc_ehufsi[nbits];	code = dc_ehufco[nbits]; 	/* size and code */
 if (sq == 0) return huff_encode_err(nbits); 
 EMITCODE2	/* a macro, used here and for ac terms */

 /* now up to 63 ac terms, dtc[1 to 63] */
 /* Encode the AC coefficients per section F.1.2.2 */
  
 rs = 0;			/* rs = run length of zeros */
 for (idct = 1; idct < 64; idct++) { /* change back to 64 */
  if ((temp =dct[idct]) == 0) rs++;  else {
    while (rs > 15) {
    /* if run length > 15, must emit special run-length-16 codes (0xF0),
    this is usually a long code, typically 10 or 11 bits, fairly rare */
    sq = ac_ehufsi[0xF0];	code = ac_ehufco[0xF0]; /* size and code */
    if (sq == 0) return huff_encode_err(nbits); 
    EMITCODE		/* defined above, good for 16 bits or less */
    rs -= 16;	}

    /* as with the dc term, get temp2 and nbits */
    temp2 = temp;
    if (temp < 0) { temp = -temp;	temp2--; }
      
    nbits = 1;		/* there must be at least one 1 bit for the ac's */
    while ((temp >>= 1)) nbits++;
    
    /* the JPEG symbol is 8 bits using 4 bits for the run length and 4 for
    the magnitude range of the next non-zero ac term, this 8 bit symbol is
    Huffman coded and we now emit the Huffman code plus the extra bits */
      
    iq = (rs << 4) + nbits;
    sq = ac_ehufsi[iq];	code = ac_ehufco[iq]; 	/* size and code */
    if (sq == 0) return huff_encode_err(iq); 
    EMITCODE2	/* a macro, used here and for dc term */     

    rs = 0;
    }
  }

 /* If the last coef(s) were zero, emit an end-of-block code (#0) */
 if (rs > 0) {
    sq = ac_ehufsi[0];	code = ac_ehufco[0]; 	/* size and code */
    if (sq == 0) return huff_encode_err(nbits); 
    EMITCODE		/* defined above, good for 16 bits or less */
    }
  dct = dct +64;	/* point to next block */
 }
 lenx = px - x + 1;
 if ( (r1%8) != 0 ) lenx++;
 return lenx;
 }
 /*------------------------------------------------------------------------- */
int huff_decode_err(n)
 int	n;
 {
 printf("Huffman decoding error # %d\n", n);
 return -1; 
 }
 /*------------------------------------------------------------------------- */
int huff_encode_err(n)
 int	n;
 {
 printf("illegal symbol, no Huffman code for value %d\n", n);
 return -1; 
 }
 /*------------------------------------------------------------------------- */
int ana_jpeg12_scan(narg,ps)	/* test routine for decoding epic dct's */
 /* an ana function, returns dct array */
 int	narg, ps[];
 /* calls huff_decode_dct */
 {
 /* expect 7 args
 dct = jpeg12_scan(nblocks, x, dc_bits, dc_huffval, ac_bits,
 	ac_huffval, restart_interval)
 note that x is modified	
 */

 int	nd, nx, ny, dim[8], iq, *freq, n, lenx, i, j, limit, nblocks;
 int	nval_ac, nval_dc, blimit, restart_interval, icount;
 int	result_sym, nb, n_restarts_found, code, iblock, stat, n_unexpected;
 struct	ahead	*h;
 byte *dc_bits, *dc_huffval, *ac_bits, *ac_huffval;
 short last_dc = 0, *dct;
 byte	*xbuffer, *px, *pt, *pb;

 /* first arg is nblocks */
 if (int_arg_stat(ps[0], &nblocks) != 1) return -1;
 if (int_arg_stat(ps[6], &restart_interval) != 1) return -1;
 /* setup the dct array, a (64,nblocks) I*2 array */
 dim[0] = 64;	dim[1] = nblocks;
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct = (short *) ((char *)h + sizeof(struct ahead));

 /* the second arg is the encoded data, don't worry about data type,
 but must be an array. We figure out total bytes available for limit */
 
 iq = ps[1];
 if ( sym[iq].class != 4 ) return execute_error(66);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 limit = ana_type_size[sym[iq].type];
 for (j=0;j<nd;j++) limit *= h->dims[j]; 		/* # of bytes */
 xbuffer = (byte *) ((char *)h + sizeof(struct ahead));
 
 /* arg 3 is the dc bit length table, must be 16 long */
 iq = ps[2];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dc_bits = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
 	printf("JPEG12_SCAN, bits must be a vector of 16\n");
	return -1; }
 /* arg 4, now the dc values, the size determines nval_dc */
 iq = ps[3];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dc_huffval = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nval_dc = h->dims[0];
 if (nd != 1) {
 	printf("JPEG12_SCAN, values must be a vector\n");
	return -1; }

 /* arg 5 is the ac bit length table, must be 16 long */
 iq = ps[4];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 ac_bits = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
 	printf("JPEG12_SCAN, bits must be a vector of 16\n");
	return -1; }
 /* arg 6, now the ac values, the size determines nval_ac */
 iq = ps[5];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 ac_huffval = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nval_ac = h->dims[0];
 if (nd != 1) {
 	printf("JPEG12_SCAN, values must be a vector\n");
	return -1; }

 /* scan for padded markers and messages, decode each restart section after
 we finish the scan of it */
 px = xbuffer;
 pt = pb = px;
 iblock = 0;
 icount = 0;
 n = 0;		/* n will be the bytes found between restarts */
 /* nblocks is the total for the whole image, nb is the # for each restart
 interval, if that is 0 then nb = nblocks (i.e., no restarts) */
 if (restart_interval) nb = restart_interval; else nb = nblocks;
 printf("nb = %d\n", nb);
 for (;;) {	/* scan until we hit limit or decode nblocks or error */
 if ( (*pt++ = *px++) == 0xff) {	/* a message or pad? */
  code = *px++ & 0x00ff;
  if (code == 0)
    /* just a padded ff, ff in stream already, just bump n */
     n++; else {
   if (code == 0xff) {
     /* this is 2 ff's */
     *pt++ = 0xff;	n +=2;	} else {
     if ( (code == 0xd9) || ((code & 0xf8) == 0xd0) )
     {
     printf("message code = %#x, icount = %d\n", code, icount);
     --pt;
     n_restarts_found += 1;
     /* either the end of image or a restart, process what we have */
     /* we assume that we only process a number of blocks equal to the
     restart_interval or whatever is left in image, note that normal
     TRACE images will always have an integer number of restart
     intervals */
     if ( (nblocks - iblock) < nb ) nb = nblocks - iblock;
     if (nb > 0) {
     //printf("decoding, n, nb = %d, %d at %d, iblock = %d, ic = %d\n",
     //	n, nb, pb, iblock,icount);
     icount++;
     stat = huff_decode_dct(&dct[iblock*64], nb, pb, n, dc_bits, dc_huffval,
     	last_dc, ac_bits, ac_huffval, nval_dc, nval_ac);
     if (stat != 1) printf("error at iblock = %d\n", iblock);
     } else {
       /* got a restart or EOI and no data left, OK if EOI, in any
       case we have to stop */
       if (code == 0xd9) { printf("got the EOI\n"); } else {
       	  printf("abnormal end of image, beware!\n"); }
       break;
     }
 
     /* if the code was the end of image, we should be done */
     n = 0;	pb = pt = px;	iblock +=  nb; 
     if (code == 0xd9) { printf("got the EOI\n"); break; }
 
     } else {
     printf("unexpected message %#x at iblock = %d\n", code, iblock);
     n_unexpected += 1;
     /* check ahead to see if there is a restart message next */
     printf("%#x  %#x  %#x  %#x\n", *px, *(px+1), *(px+2), *(px+3) );
     /* treat as data and see if we choke */
     *pt++ = code;  n += 2;
     /*return scan_err(1);*/
     }
     } }
     /* all of the cases with a code use up 2 bytes, so */
     limit = limit -2;
  } else { limit--;  n ++; } /* if not a ff, only one byte consumed */

 if ( limit <= 0) { printf("jpeg12_scan - limit error\n");
 	printf("px-xbuffer = %d, n = %d\n", px-xbuffer, n);
 	printf("pt-xbuffer = %d, n = %d\n", pt-xbuffer, n);
 	break;}
  }

 return result_sym;
 }
 /*------------------------------------------------------------------------- */
int ana_huff_decode_dct(narg,ps)	/* test routine for decoding dct's */
 /* an ana function, returns dct array */
 int	narg, ps[];
 /* calls huff_decode_dct */
 {
 /* expect 6 args
 optional 7th added 5/29/96, pflag set to 0 disables padding removal
 dct = huff_decode_dct(nblocks, x, dc_bits, dc_huffval, ac_bits,
 	ac_huffval, [pflag]) */

 int	nd, nx, ny, dim[8], iq, *freq, n, lenx, i, j, limit, nblocks;
 int	nval_ac, nval_dc, blimit, pflag=1;
 int	result_sym;
 struct	ahead	*h;
 byte *dc_bits, *dc_huffval, *ac_bits, *ac_huffval;
 short last_dc = 0, *dct;
 byte	*xbuffer, *px, *pt;

 /* first arg is nblocks */
 if (int_arg_stat(ps[0], &nblocks) != 1) return -1;
 if (narg > 6) { if (int_arg_stat(ps[6], &pflag) != 1) return -1; }
 /* setup the dct array, a (64,nblocks) I*2 array */
 dim[0] = 64;	dim[1] = nblocks;
 result_sym = array_scratch(1, 2, dim);
 h = (struct ahead *) sym[result_sym].spec.array.ptr;
 dct = (short *) ((char *)h + sizeof(struct ahead));

 /* the second arg is the encoded data, don't worry about data type,
 but must be an array. We figure out total bytes available for limit */
 /* we also do a pass to remove zero pads after false markers, this
 modifies the buffer array */
 iq = ps[1];
 if ( sym[iq].class != 4 ) return execute_error(66);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 nd = h->ndim;
 limit = ana_type_size[sym[iq].type];
 for (j=0;j<nd;j++) limit *= h->dims[j]; 		/* # of bytes */
 xbuffer = (byte *) ((char *)h + sizeof(struct ahead));
 /* scan for padded markers and remove the zero pads, if markers
 without zero pads are found, emit a warning */
 if (pflag == 1) {
 n = i = limit;
 px = xbuffer;
 while (n--) {
 /* skip to first ff */
 if (*px++ == 0xff) break;
 }
 /* found the first one, back up one */
 px--;	n++;
 pt = px;
 while ( (n--) > 0 ) {
   if ( (*pt++ = *px++) == 0xff) {
     if (*px++ != 0 ) {		/* expected a zero?, mention this */
       printf("huff_decode_dct, unpadded marker encountered, beware\n");
       px--;	i = px - xbuffer;
       printf("mark %#x followed by %#x, at i = %d\n", *(px-1), *px, i);
       *pt++ = *px;	px++;	/* pass it on but probably hopeless */
       } else { limit--; n--;}
 }}
 /* printf("removed %d pads from encoded data\n", i - limit); */
 } /* end of pad removal section */
 
 /* arg 3 is the dc bit length table, must be 16 long */
 iq = ps[2];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dc_bits = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
 	printf("HUFF_DECODE_DCT, bits must be a vector of 16\n");
	return -1; }
 /* arg 4, now the dc values, the size determines nval_dc */
 iq = ps[3];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dc_huffval = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nval_dc = h->dims[0];
 if (nd != 1) {
 	printf("HUFF_DECODE_DCT, values must be a vector\n");
	return -1; }

 /* arg 5 is the ac bit length table, must be 16 long */
 iq = ps[4];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 ac_bits = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nx = h->dims[0];
 if (nd != 1 || nx != 16) {
 	printf("HUFF_DECODE_DCT, bits must be a vector of 16\n");
	return -1; }
 /* arg 6, now the ac values, the size determines nval_ac */
 iq = ps[5];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 0 )  iq = ana_byte(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 ac_huffval = (byte *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 nval_ac = h->dims[0];
 if (nd != 1) {
 	printf("HUFF_DECODE_DCT, values must be a vector\n");
	return -1; }

 huff_decode_dct(dct, nblocks, xbuffer, limit, dc_bits, dc_huffval, last_dc,
	ac_bits, ac_huffval, nval_dc, nval_ac);
 return result_sym;
 }
 /*------------------------------------------------------------------------- */
int huff_decode_dct(dct, nblocks, x, limit, dc_bits, dc_huffval, last_dc,
	ac_bits, ac_huffval)
 /* returns decoded dct, the coded data is in x, huffman tables as in
 huff_encode_dct */
 short	dct[], last_dc;
 byte	x[];
 int	limit, nblocks;
 byte dc_bits[], dc_huffval[], ac_bits[], ac_huffval[];
 {
 int	i, j, sq, code, k, idct, r1, size, iq, lenx, lastp, n, nval;
 int	jq, result, look, ntable, lookbits, nb, lbits, ks, iblock;
 unsigned char huffsize[256], *p, *bits, *huffval;
 unsigned short	huffcode[256], *pc;
 int	dc_mincode[16], dc_maxcode[16], dc_valptr[16];
 int	ac_mincode[16], ac_maxcode[16], ac_valptr[16];
 int	*mincode, *maxcode, *valptr;
 byte	dc_look_nbits[256],dc_look_sym[256],ac_look_nbits[256],ac_look_sym[256];
 byte	*look_nbits, *look_sym;
 unsigned int	y, z, temp, temp2, nbits, rs;
 unsigned int mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff,
  0x1ff,0x3ff,0x7ff,0xfff,0x1fff,0x3fff,0x7fff,0xffff};
 byte *px;

 /* use bits to generate the Huffman codes and sizes in code-length order */
 /* for both dc and ac tables */
 ntable = 2;
 bits = dc_bits;	valptr = dc_valptr;	mincode = dc_mincode;
 maxcode = dc_maxcode;	look_nbits = dc_look_nbits;
 look_sym = dc_look_sym;	huffval = dc_huffval;
 while (ntable--) {
 p = huffsize;
 pc = huffcode;
 code = 0;
 for (k = 0; k < 16; k++) {
   j = k + 1;	n = (int) bits[k];	/* n is the number of this length */
   while (n--)  {			/* the codes of a common length */
   	*p++ = (unsigned char) (j);	/* just increase by 1 wrt previous */
	*pc++ = code++;	  }
  code <<= 1;				/* for new code size, left shift */
  }
  *p = 0;
  lastp = p - huffsize;

 /* Figure F.15: generate decoding tables for bit-sequential decoding */
 iq = 0;
 for (k = 0; k < 16; k++) {
   if (bits[k]) {
     valptr[k] = iq; 	/* huffval[] index of 1st symbol of code length k+1 */
     mincode[k] = huffcode[iq]; /* minimum code of length k+1 */
     iq += bits[k];
     maxcode[k] = huffcode[iq-1]; /* maximum code of length k+1 */
   } else {
     maxcode[k] = -1;	/* -1 if no codes of this length */
   }
 }

 /* generate lookup tables to speed up the process for Huffman codes of
 length 8 or less, one set for dc and ac, follows technique in jpeg-5a but
 not the same code so some variables may look similar but beware ! */
 
 bzero(look_nbits, 256);
 iq = 0;
 for (k = 0; k < 8; k++) {
  for (i = 0; i < (int) bits[k]; i++, iq++) {
  /* k+1 = current code's length, iq = its index in huffcode[] & huffval[]. */
  /* Generate left-justified code followed by all possible bit sequences */
    lookbits = huffcode[iq] << (7 - k);
    jq = k+1;
    for (j= 1 << (7 - k);j > 0;j--) {
      look_nbits[lookbits] = jq;
      look_sym[lookbits] = huffval[iq];
      lookbits++;
    }
  }
 } 
 bits = ac_bits;	valptr = ac_valptr;	mincode = ac_mincode;
 maxcode = ac_maxcode;	look_nbits = ac_look_nbits;
 look_sym = ac_look_sym;	huffval = ac_huffval;
 }

 px = x;	/* where we get the Huffman coded version */
 r1 = 0;	/* a bit count */

 /*printf("in huff_decode_dct, mark 1 =========================\n");*/
 /* decode the dc and ac components of a block */
 /* parts modified from the jpeg-5a code */

 /* iblock = 0; */
 while (nblocks--) {
 /*
 printf("iblock = %d\n", iblock);
 iblock++;
 */
 bzero(dct, 128);	/* pre-zero this block */
 /* start dc decode, grab 8 bits and use lookup shortcut to see if we
 got a short Huffman code */
 
 i=r1>>3;	j=r1%8;		/* our byte and bit addresses */
 if (i > limit) return dct_buffer_decode_err(limit);
 /*printf("start dc decode, r1, i, j = %d %d %d ------------\n", r1, i,j);*/
 px = x + i;
 if (j == 0) {	/* aligned on byte, lucky 1/8 */
  look = *px;
 } else {	/* need part of next byte as well */
  jq = 8 - j;	/* available bits in px */
  look = *px++ << j;	/* msb, make room for lsb */
  look |= ((int) *px) >> jq;
  look = look & 0xff;
 }
 /*printf("look = %#x\n", look);*/
 if ((nb = dc_look_nbits[look]) != 0) {
     nbits = dc_look_sym[look];
     lbits = 8 -nb;
     r1 += nb;
     /*printf("successful lookup, size = %d, value(nbits) = %d\n", nb, nbits);*/
 } else {
 /*printf("slow decode\n");*/
     /* get here if the code is longer than 8 bits or some kind of disaster */
     r1 += 8;	/* point after look */
     i=r1>>3;	j=r1%8;		px = x + i;
     /* get 16 bits in temp */
     temp = ((int) look) << 8;	/* look will be top 8 */
     if (j == 0) { temp = temp | ( (int) *px );
     } else {
     /* need 2 transfers, 8 bits total */
     jq = 8 - j;	/* available bits in px */
     temp |= ( *px++ << j );
     temp |= ( ((int) *px) >> jq);
     }
     /*printf("16 bit candidate in slow decode = %#x\n", temp);*/
     k = 7;	nb = 8;		/* here nb is code size - 1 */
     while ( (ks =(temp >> k)) > dc_maxcode[nb] ) { k--; nb++;
     	if (k < 0)  return huff_decode_err(0); }
     /* note error return if we couldn't find a code */
     nbits = dc_huffval[ dc_valptr[nb] + ks - dc_mincode[nb] ];
 /*printf("ks=%d, nb=%d, dc_maxcode[nb]=%d, dc_mincode[nb]=%d,dc_valptr[nb]=%d\n",
      ks,nb,dc_maxcode[nb],dc_maxcode[nb],dc_valptr[nb]);*/
     nb++;	/* actual size of code */
     /*printf("size of code from slow_decode = %d\n", nb);*/
     r1 += (nb -8);
     lbits = 0;
 }
 /*printf("nbits = %#x\n", nbits);*/
 /* that defines the dc range interval, now get the rest of the bits */
 /* if nbits is 0, don't need any */
 if (nbits) {
 /* we have some still in look, lbits */
 /* if so, nb is valid, so use it to shift old look */
 /*printf("lbits = %d\n", lbits);*/
 if (lbits >= nbits) {  temp = (look>> (lbits-nbits)) & mask[nbits];
 } else {
 /* harder, need "nbits" # of bits */
 i=r1>>3;	j=r1%8;		px = x + i;
 /*printf("start appended bits, r1, i, j = %d %d %d\n", r1, i,j);*/
 jq = 8 - j;
 temp = ( *px  & mask[jq] );	/* gets us jq ls bits */
 ks = nbits - jq;		/* how much more needed (could be < 0) */
 /*printf("jq, ks, temp =%d %d %#x\n", jq, ks, temp);*/
 if (ks>0) { temp = temp << ks;	/* shift over to make room */
  ks -=8;	temp2 = ((int) *++px);
 /*printf("second byte, ks, temp2 = %d %#x\n", ks, temp2);*/
  if (ks>0) { temp |= ( temp2 << ks );	/* need a third ? */
    ks -=8;	temp2 = ((int) *++px);
 /*printf("third byte, ks, temp2 = %d %#x\n", ks, temp2);*/
    }
 temp |= ( temp2 >> (-ks) );		/* last, #2 or #3 */
 } else { temp = temp >> (-ks); }
 }
 r1 += nbits;	/* count after this is done */
 /* now extend the sign, uses the jpeg-5a macro defined above */
 /* we use the lookup table version */
 /*printf("temp = %#x ", temp);*/
 nbits = huff_EXTEND(temp, nbits);
 /*printf("extended = %#x\n", nbits);*/
 }
 dct[0] = last_dc = nbits + last_dc;
 /*printf("dc = %d, %#x\n", dct[0], dct[0]);*/
 /* wraps the dc term, start the ac */

 /*printf("start ac decode, r1, i, j = %d %d %d ***************\n", r1, i,j);*/
 for (idct = 1; idct < 64; idct++) {
 /*printf("start ac term %d\n", idct);*/
 i=r1>>3;	j=r1%8;		px = x + i;
 if (i > limit) return dct_buffer_decode_err(limit);
 /* decode the Huffman symbol, use ac tables */
 if (j == 0) {	/* aligned on byte, lucky 1/8 */
  look = *px;
 } else {	/* need part of next byte as well */
  jq = 8 - j;	/* available bits in px */
  look = *px++ << j;	/* msb, make room for lsb */
  look |= ((int) *px) >> jq;
  look = look & 0xff;
 }
 /*printf("look = %#x\n", look);*/
 if ((nb = ac_look_nbits[look]) != 0) {
     nbits = ac_look_sym[look];
     lbits = 8 -nb;
     r1 += nb;
     /*printf("successful lookup, size = %d, value(nbits) = %d\n", nb, nbits);*/
 } else {
 /*printf("slow decode\n");*/
     /* get here if the code is longer than 8 bits or some kind of disaster */
     r1 += 8;	/* point after look */
     i=r1>>3;	j=r1%8;		px = x + i;
     /* get 16 bits in temp */
     temp = ((int) look) << 8;	/* look will be top 8 */
     if (j == 0) { temp = temp | ( (int) *px );
     } else {
     /* need 2 transfers, 8 bits total */
     jq = 8 - j;	/* available bits in px */
     temp |= ( *px++ << j );
     temp |= ( ((int) *px) >> jq);
     }
     /*printf("16 bit candidate in slow decode = %#x\n", temp);*/
     k = 7;	nb = 8;		/* here nb is code size - 1 */
     while ( (ks =(temp >> k)) > ac_maxcode[nb] ) { k--; nb++;
     	if (k < 0)  return huff_decode_err(1); }
     /* note error return if we couldn't find a code */
     nbits = ac_huffval[ ac_valptr[nb] + ks - ac_mincode[nb] ];
 /*printf("ks=%d, nb=%d, ac_maxcode[nb]=%d, ac_mincode[nb]=%d,ac_valptr[nb]=%d\n",
      ks,nb,ac_maxcode[nb],ac_maxcode[nb],ac_valptr[nb]);*/
     nb++;	/* actual size of code */
     /*printf("size of code from slow_decode = %d\n", nb);*/
     r1 += (nb -8);
     lbits = 0;
 }
 /*printf("nbits = %#x\n", nbits);*/
 /* that defines the ac symbol, contains a zero run length and bits in
 next non-zero coeff. */
 rs = nbits >> 4;	/* run length */
 nbits = nbits & 0xf;	/* bits in next one, unless 0, then a special code */
 /*printf("rs = %d, nbits = %d\n", rs, nbits);*/
 if (nbits == 0) {
   if (rs != 15) break;		/* hit the end, rest are all zips */
   idct += 15;
 } else {
 idct += rs;			/* skip over the zeroes */
 /* need the next nbits bits */
 /* we have some still in look, lbits */
 /* if so, nb is valid, so use it to shift old look */
 /*printf("lbits = %d\n", lbits);*/
 if (lbits >= nbits) {  temp = (look>> (lbits-nbits)) & mask[nbits];
 } else {
 /* harder, need "nbits" # of bits */
 i=r1>>3;	j=r1%8;		px = x + i;
 /*printf("start appended bits, r1, i, j = %d %d %d\n", r1, i,j);*/
 jq = 8 - j;
 temp = ( *px  & mask[jq] );	/* gets us jq ls bits */
 ks = nbits - jq;		/* how much more needed (could be < 0) */
 /*printf("jq, ks, temp =%d %d %#x\n", jq, ks, temp);*/
 if (ks>0) { temp = temp << ks;	/* shift over to make room */
  ks -=8;	temp2 = ((int) *++px);
 /*printf("second byte, ks, temp2 = %d %#x\n", ks, temp2);*/
  if (ks>0) { temp |= ( temp2 << ks );	/* need a third ? */
    ks -=8;	temp2 = ((int) *++px);
 /*printf("third byte, ks, temp2 = %d %#x\n", ks, temp2);*/
    }
 temp |= ( temp2 >> (-ks) );		/* last, #2 or #3 */
 } else { temp = temp >> (-ks); }
 }
 r1 += nbits;	/* count after this is done */
 /* now extend the sign, uses the jpeg-5a macro defined above */
 /* we use the lookup table version */
 /*printf("temp = %#x ", temp);*/
 nbits = huff_EXTEND(temp, nbits);
 /*printf("extended = %#x\n", nbits);*/
 /* this nbits is now the ac term */
 dct[idct] = (short) nbits;
 /*printf("ac term = %d\n", dct[idct]);*/
 }
 }	/* end of idct loop for ac terms */
 dct += 64;
 }	/* end of nblocks loop */
 return 1;
 }
 /*------------------------------------------------------------------------- */
int ana_dcthist(narg,ps)	/* test routine for encoding dct's */
 /* computes histograms for the dc and ac symbols that get Huffman coded */
 int	narg, ps[];
 /* calls huff_encode_dct */
 {
 /* expect 3 args
 dcthist, dct, dcbins, acbins
 the first is input, the others are output */

 int	nd, nx, ny, dim[8], iq, *freq, nval, n, lenx, i, j, nblocks;
 int	*dcbins, *acbins, rs, temp, temp2, nbits, idct;
 struct	ahead	*h;
 short last_dc = 0, *dct;
 byte	*xbuffer, *tbuffer;
 byte *px, *pt;
 /* first arg is the dct array, usually a (64,nblocks) array but
 we accept any thing that is a multiple of 64 */
 /* we make it an I*2 array */
 iq = ps[0];
 if ( sym[iq].class != 4 ) return execute_error(66);
 if ( sym[iq].type != 1 )  iq = ana_word(1, &iq);
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dct = (short *) ((char *)h + sizeof(struct ahead));
 nd = h->ndim;
 n = 1; for (j=0;j<nd;j++) n *= h->dims[j]; 	/* # of elements */
 if ( (n%64) != 0 || n == 0) {
 	printf("DCTHIST, dct array must be 0 mod 64\n");
	return -1; }
 nblocks = n/64;

 /* create the output arrays */
 /* redefine the second arg to be a long array 16 long */
 iq = ps[1];
 dim[0] = 16;
 nd = 1;
 if (redef_array(iq, 2, nd, dim) != 1) return -1;
 h = (struct ahead *) sym[iq].spec.array.ptr;
 dcbins = (int *) ((char *)h + sizeof(struct ahead));
 /* redefine the third arg to be a long array 16 x 16 */
 iq = ps[2];
 dim[0] = 16; dim[1] = 16;
 nd = 2;
 if (redef_array(iq, 2, nd, dim) != 1) return -1;
 h = (struct ahead *) sym[iq].spec.array.ptr;
 acbins = (int *) ((char *)h + sizeof(struct ahead));

 /* zero these */
 bzero(dcbins, 16*4);	bzero(acbins, 256*4);
  
 /* now run through the blocks, compute the strange values, and accumulate
 the histograms */
 
 n = nblocks;
 while (n--) {
 temp = temp2 = dct[0] - last_dc;
 last_dc = dct[0];
 if (temp < 0) { temp = -temp;	temp2--; }
 nbits = 0;
 while (temp) { nbits++;  temp >>= 1; } /* this zaps temp, we use temp2 */
 dcbins[nbits] += 1;
 /* ac terms */ 
 rs = 0;			/* rs = run length of zeros */
 for (idct = 1; idct < 64; idct++) { /* change back to 64 */
  if ((temp =dct[idct]) == 0) rs++;  else {
    while (rs > 15) { acbins[0xf0] += 1;	rs -= 16;	}
     temp2 = temp;
    if (temp < 0) { temp = -temp;	temp2--; }
    nbits = 1;
    while ((temp >>= 1)) nbits++;
    iq = (rs << 4) + nbits;
    acbins[iq] += 1;
    rs = 0;
    }
 }
 /* If the last coef(s) were zero, emit an end-of-block code (#0) */
 if (rs > 0) {  acbins[0] += 1; }
 dct = dct +64;	/* point to next block */
 }
 return 1;
 }
 /*------------------------------------------------------------------------- */