/home/knoppix/=/muldemos/NUSS.C

/*
     NUSS.C
*/
/*
** This file is placed into the public domain by its author,
** Carey Bloodworth (Carey@Bloodworth.org) on July 16, 2001
**
** This multiplication demo is not designed for high performance.
** It's a tutorial program designed to be used with the information
** on my web site at www.Bloodworth.org
*/
/*
** This file demonstrates a very basic Nussbaumer convolution.
**
** To compile this using GCC:
** gcc main.c nuss.c -o nuss.exe
*/
/*
** There are a lot of ways to arrange things.  (All use DiF/DiT pairs.)
** The code shown here is pretty much the 'official' style that Nussbaumer
** & Knuth use.  The exception is that I've extracted the 'twist' code
** and made them into more readable functions, and I added some code
** to allow calling a faster FFT mul when things get small enough.  (The
** fft mul has to be able to handle our signed data, of course.)
**
** This sample code is *VERY* crude.  It'd need a lot of work before it
** would become even vaguely practical.
*/
#include 
#include 
#include 

typedef unsigned long int UINT32;
typedef unsigned long long int UINT64;
typedef long int INT32;
typedef long long int INT64;
typedef short int Short;

/*
** Threshold where it's faster to use some other method to multiply
** data of lengths this and smaller.
**
** If you don't want a threshold (ie: you want to recurse all the way
** down), just set it to zero.  It will, however, cost you a great
** deal of time.
**
** See note in SimpleNegaCyclicMul()
*/
/* FFT isn't included in this example program.
#define CYCLIC_THRESHOLD     1024
#define NEGACYCLIC_THRESHOLD 1024
#define ALLOW_FFTNEGMUL      1
*/
#define CYCLIC_THRESHOLD     0
#define NEGACYCLIC_THRESHOLD 0


typedef float  S_DIGIT;
typedef double D_DIGIT;
/*
** Both data types need to be signed.
**
** Single needs to be big enough to hold BASE*Len.  In other words
** Log2(Len) extra bits.
**
** Double needs to twice as big.  It has to hold the product of two
** Singles, including 2*Log2(Len) extra bits.
**
** I'm using a float & double here, but it can be any data type.  It
** can even be some multi-precision data type you code yourself.
*/

/*
** I'm sort of pretending that I have some special data type
** for Nussbaumer data elements.  I don't, of course, but
** this helps keep the two data widths seperate.
**
** There are still a few places in the code where I access the
** data directly, but those areas are pretty obvious.
*/
#define N_Copy(_a,_b)      _a=_b
#define N_Setv(_a,_b)      _a=_b
#define N_Zero(_a)         _a=0
#define N_Getv(_a)         _a
#define N_Add(_s,_a,_b)    _s=_a+_b
#define N_Sub(_d,_a,_b)    _d=_a-_b
#define N_MulDSS(_p,_a,_b) {_p=((D_DIGIT)_a)*((D_DIGIT)_b);}
#define N_Neg(_a)          _a=-_a

#define N_AddDDD(_s,_a,_b) _s=_a+_b
#define N_SubDDD(_d,_a,_b) _d=_a-_b
/*#define N_Div2(_b,_a)    _b=((_a)>>1)*/
#define N_Div2(_b,_a)    _b=((_a)/2)
#define N_ZeroD(_a)        _a=0
#define N_SetvD(_a,_b)     _a=_b
#define N_GetvD(_a)        _a
#define N_NegD(_a)         _a=-_a
#define N_CopyD(_a,_b)     _a=_b

static S_DIGIT *NUSSNum1=NULL, *NUSSNum2=NULL;
static D_DIGIT *NUSSProd=NULL;
static double BASE;
static int BASE_DIG;


int Log2(UINT32 Num)
{UINT32 x=1;
 int y=-1;
while (x <= Num)
  {
   x*=2;
   y++;
  }
return y;
}

#if 0
void DumpMatrix(char *str,S_DIGIT *data,int Height, int Width)
{int x,y;
if (str) printf("%s\n",str);
for (y=0;y NEGACYCLIC_THRESHOLD)
  {
   printf("hmmm... SimpleNegaCyclicMul() called with a length larger than expected.\n");
  }
/* Faster to do a FFT?? */

#ifdef ALLOW_FFTNEGMUL
if (Len >= 32)
  {
    FFTNegMul(Prod,Data1,Data2,Len);
    return;
  }
#endif

/*
** Gotta be careful about this because at the higher levels it's
** quite possible to get only muls that are smaller than threshold.
*/
/*
** I was playing with a special 'counting' version of this program and
** I was trying various threshold sizes and so on.  When I went up
** fairly high, I noticed cases where the entire Mul count was just
** the schoolboy below and no FFT mul!  Obviously, in a real program
** that's NOT good!!
**
** Because of the way Nussbaumer splits up the data, this routine can
** be called with a wide variety of sizes, so we need a wide margin
** to decide whether to call the FFT or some other method.
**
** In my tests, I was just using a threshold of 1k-8k with NussLen's
** of up to 256m.  (Not really realistic.) I discovered I needed to do
** various divisors to make sure I got all the FFT mul's I was entitled.
**
** 1024   /16 =  64
** 2048   /32 =  64
** 4096   /32 = 128
** 8192   /64 = 128
**
** In other words, it appears the MAXIMUM fft threshold must be no
** more than the power of two above the square root of the main
** threshold.  ie: sqrt(1024)=32, means max fft threshold would be 64.
** Or, to put it another way, do a FFT from 64 to 1024 lengths.
**
** Again, this was done with extreme cases, but it is something
** you need to remember.
*/
for (p=0;p= q) N_AddDDD(Prod[p],Temp,Prod[p]);
      else        N_SubDDD(Prod[p],Prod[p],Temp);
     }
  }
}


/*
** These twists are the 'mul by root' routines.  It just happens
** to work out that we can do all this by shifting and negating.
** Now, this isn't how Knuth & Nussbaumer describe it.  They do
** it as part of the butterfly and doing it on a bias, putting the
** result into temp storage, and then copying it back into the
** correct location.
**
** Doing it this way makes the code a bit clearer.  Looks more
** like a regular FFT.  Slow, though.
*/
void Twist(S_DIGIT *Data,int Len, int Expon)
{S_DIGIT *Temp = malloc(sizeof(S_DIGIT)*Len);
 int x;
if (Temp==NULL) {printf("Can't allocate for Twist.\n");exit(0);}
if (Expon > Len) {printf("Twist: Expon=%d Len=%d\n",Expon,Len);exit(0);}

memcpy(Temp,Data,Len*sizeof(S_DIGIT));
memcpy(Data,Temp+(Len-Expon),Expon*sizeof(S_DIGIT));
memcpy(Data+Expon,Temp,(Len-Expon)*sizeof(S_DIGIT));
for (x=0;x<(Len-Expon);x++) N_Neg(Data[Expon+x]); /* negate */

free(Temp);
}

void TwistD(D_DIGIT *Data,int Len, int Expon)
/* Twist for the double precision output data */
/* Note that it's a different direction, too. */
{D_DIGIT *Temp = malloc(sizeof(D_DIGIT)*Len);
 int x;
if (Temp==NULL) {printf("Can't allocate for TwistD.\n");exit(0);}
if (Expon > Len) {printf("TwistD: Expon=%d Len=%d\n",Expon,Len);exit(0);}

memcpy(Temp,Data,Len*sizeof(D_DIGIT));
memcpy(Data,Temp+(Expon),(Len-Expon)*sizeof(D_DIGIT));
memcpy(Data+(Len-Expon),Temp,Expon*sizeof(D_DIGIT));
for (x=0;x<(Expon);x++) N_NegD(Data[Len-Expon+x]);

free(Temp);
}


/*
** The Nussbaumer FFTs are actually pretty standard FFTs.
**
** The differences are
** 1: They use their own special root.
** 2: Because of #1, the multiplication by the root is a 'twist'
** 3: The transforms are column based while the twist is row based.
**
** Also worth noting is that Knuth & Nussbaumer do the 'division by
** the power of two' normalization during the inverse transform.
** Most transforms wait until after it's done, and so could these,
** but I figured it was worth leaving it in this way.
**
** The transforms themselves are pretty much a regular transform, so that
** means you can use about any style you want.
**
** Normally the transforms are done vertically and the twist is done
** horizontally.  It's possible to do it the other way, which gets rid
** of the matrix transposition.
**
** It's also possible to do the transforms recursively and do it
** without the twists.  Everything is 'on the bias'.  There is a
** lot of overhead, though.
*/

static UINT32
BitReversal(UINT32 Mask, UINT32 Val)
{
  UINT32 R = 0;

  if (Val==0) return 0;

  do
    {
      R *= 2;
      R += (Val & 1);
      Mask /= 2;
      Val /= 2;
    }
  while (Mask > 1);

  return R;
}

#define DATA(y,x) Data[((y)*Width)+(x)]
/* DATA[Col,Row] ie: DATA[Height,Width] */

void
FwdNussFFT(S_DIGIT *Data,int Height,int Width)
/* DiF Nussbaumer FFT without scrambling w/ BitReversal roots */
{UINT32 x,Step,Chunk,n;

  for (Step=Height; Step > 1; Step/=2)
    { UINT32 HalfStep=Step/2;
      UINT32 Chunks = Height/Step;
      UINT32 Root = (Step*Width)/Height;
      for (Chunk = 0; Chunk < Chunks; Chunk++)
        {
          UINT32 CurRoot = BitReversal(Chunks, Chunk) * Root;
          for (n = 0; n < HalfStep; n++)
            {
              UINT32 Left = Chunk*Step+n;
              UINT32 Right = Left + HalfStep;

              Twist(&DATA(Right,0),Width,CurRoot);

              for (x = 0; x < Width; x++)
                {S_DIGIT L,R;
                 N_Copy(L,DATA(Left,x));
                 N_Copy(R,DATA(Right,x));
                 N_Sub(DATA(Left,x),L,R);
                 N_Add(DATA(Right,x),L,R);
                }
            }
        }
    }
}

void
RevNussFFT(D_DIGIT *Data,int Height,int Width)
/* DiT Nussbaumer FFT without scrambling w/ BitReversal roots */
{UINT32 x,Step,Chunk,n;

  for (Step = 2; Step <= Height; Step *=2)
    { UINT32 HalfStep=Step/2;
      UINT32 Chunks = Height/Step;
      UINT32 Root = (Step*Width)/Height;
      for (Chunk = 0; Chunk < Chunks; Chunk++)
        {
          UINT32 Expon = BitReversal(Chunks, Chunk) * Root;
          for (n = 0; n < HalfStep; n++)
            {
              UINT32 Left = Chunk*Step+n;
              UINT32 Right = Left + HalfStep;
              for (x = 0; x < Width; x++)
                {D_DIGIT L,R,T;
                 N_CopyD(L,DATA(Left,x));
                 N_CopyD(R,DATA(Right,x));
                 N_AddDDD(T,L,R);N_Div2(DATA(Left,x),T);
                 N_SubDDD(T,L,R);N_Div2(DATA(Right,x),T);
                }
              TwistD(&DATA(Right,0),Width,Expon);
            }
        }
    }
}
#undef DATA


void
NegaCyclicNussMul(D_DIGIT *Result, S_DIGIT *Data1, S_DIGIT *Data2, UINT32 NussLen)
{
  UINT32 Height,Width;
  S_DIGIT *TData1,*TData2;
  D_DIGIT *TResult;
  UINT32 x,y;

  /* Small enough we can do it faster some other way? Just example. */
#if (NEGACYCLIC_THRESHOLD > 0)

  if (NussLen <= NEGACYCLIC_THRESHOLD)
    {SimpleNegaCyclicMul(Result,Data1,Data2,NussLen);return;}

#else

/* If we don't want a threshold, we need to hardwire a couple of sizes. */
  if (NussLen == 1)
    { /* Plain single digit mul. */
      N_MulDSS(Result[0], Data1[0], Data2[0]);
      return;
    }
  if (NussLen == 2)
    {/* Double digit mul.  Do it like a 'complex' mul cause it's faster.
     ** Result[0] = Data1[0]*(Data2[0]+Data2[1]) - Data2[1]*(Data1[0]+Data1[1])
     ** Result[1] = Data1[0]*(Data2[0]+Data2[1]) + Data2[0]*(Data1[1]-Data1[0])
     */
     S_DIGIT Sum1,Sum2;
     D_DIGIT Prod1,Prod2;
     N_Add(Sum1, Data1[0], Data1[1]);
     N_Add(Sum2, Data2[0], Data2[1]);
     N_MulDSS(Prod1, Data1[0], Sum2);
     N_MulDSS(Prod2, Sum1, Data2[1]);
     N_SubDDD(Result[0], Prod1, Prod2);
     N_Sub(Sum1, Data1[1], Data1[0]);
     N_MulDSS(Prod2, Sum1, Data2[0]);
     N_AddDDD(Result[1], Prod1, Prod2);
     return;
    }
/* We could, of course, go further.... */
#endif


  Height = (UINT32) 1 << (Log2(NussLen) / 2); /* Pow2(log2(sqrt(Len))) */
  Width = NussLen / Height;
  /* Width >= Height before the zero padding. */
  /* Width <= Height after the zero padding. */

/*
** If we want to, we can increase the width and decrease the height
** Just make sure height is at least 2 or it wont reduce and
** we'd end up in an infinite loop.  With zero padding later,
** the height will be at least 4.
**
** The FFT length can't be larger than the twist length is.
**
** Note: This can be a *LOT* slower, esp. if you push it to far.
**
** For example...
Height=2;Width=NussLen/Height;
*/

  TData1 = malloc(sizeof(S_DIGIT)* 2*NussLen);
  TData2 = malloc(sizeof(S_DIGIT)* 2*NussLen);
  TResult = malloc(sizeof(D_DIGIT)* 2*NussLen);

/* Make our data look like a matrix */
#define DATA1(i,j)  TData1[((i)*Width)+(j)]
#define DATA2(i,j)  TData2[((i)*Width)+(j)]
#define RESULT(i,j) TResult[((i)*Width)+(j)]
/* [Col,Row] ie: [Height,Width] */

  if ((TData1==NULL) || (TData2==NULL) || (TResult==NULL))
    {printf("Unable to allocate workspace.\n");exit(0);}

/*
** Transpose the rows & columns and then zero pad.
** This makes the data vertical, rather than horizontal.
** The zero padding is added onto the right.
*/
  for (x=0;x<2*NussLen;x++)
    {
     N_Zero(TData1[x]);
     N_Zero(TData2[x]);
     N_ZeroD(TResult[x]);
    }

for (y = 0; y < Height; y++)
   for (x = 0; x < Width; x++)
     {
      N_Copy(DATA1(y, x), Data1[(x*Height)+y]);
      N_Copy(DATA2(y, x), Data2[(x*Height)+y]);
     }


/* The height has now been doubled */
Height*=2;

FwdNussFFT(TData1,Height,Width);
FwdNussFFT(TData2,Height,Width);

/*
** Compute the negacyclic product of each row.
** We do this by recursing, but really we could call any negacylcic
** multiplication routine.
*/
for (y = 0; y < Height; y++)
   NegaCyclicNussMul(&RESULT(y,0), &DATA1(y,0), &DATA2(y,0), Width);

RevNussFFT(TResult,Height,Width);

Height/=2;

/* Make the result negacyclic and compact. */
  for (y = 0; y < Height; y++)
    {
      N_SubDDD(Result[y], RESULT(y, 0), RESULT(y+Height, Width-1));
      for (x = 1; x < Width; x++)
        N_AddDDD(Result[(x*Height) + y], RESULT(y, x), RESULT(y+Height, x-1));
    }

#undef DATA1
#undef DATA2
#undef RESULT
  free(TData1);free(TData2);free(TResult);
}


void SimpleCyclicMul(D_DIGIT *Prod, S_DIGIT *Data1, S_DIGIT *Data2, UINT32 Len)
/* Plain old n^2 schoolboy for example purposes */
{UINT32 p,i,q;
 D_DIGIT Temp;

if (Len > CYCLIC_THRESHOLD)
  {
   printf("hmmm... SimpleCyclicMul() called with a length larger than expected.\n");
  }
/* We could do a FFT here. */
for (p=0;p 0)

  if (Len <= CYCLIC_THRESHOLD)
    {SimpleCyclicMul(Prod,Data1,Data2,Len);return;}

#else
/* If we don't want to do a threshold, hardwire a couple of sizes. */
  if (Len == 1)
    {/* Basic single digit multiply. */
     N_MulDSS(Prod[0],Data1[0], Data2[0]);
     return;
    }
  if (Len == 2)
    { /* Double digit mul. */
/*
** Prod[0] := ((Data1[0]+Data1[1])*(Data2[0]+Data2[1]) +
**             (Data1[0]-Data1[1])*(Data2[0]-Data2[1])) / 2
** Prod[1] := ((Data1[0]+Data1[1])*(Data2[0]+Data2[1]) -
**             (Data1[0]-Data1[1])*(Data2[0]-Data2[1])) / 2
*/
      S_DIGIT Sum1,Sum2,Dif1,Dif2;
      D_DIGIT Prod1, Prod2;
      N_Add(Sum1, Data1[0], Data1[1]);
      N_Sub(Dif1, Data1[0], Data1[1]);
      N_Add(Sum2, Data2[0], Data2[1]);
      N_Sub(Dif2, Data2[0], Data2[1]);
      N_MulDSS(Prod1, Sum1, Sum2);
      N_MulDSS(Prod2, Dif1, Dif2);
      N_AddDDD(Prod[0], Prod1, Prod2);
      N_Div2(Prod[0], Prod[0]);
      N_SubDDD(Prod[1], Prod1, Prod2);
      N_Div2(Prod[1], Prod[1]);
      return;
    }
#endif

/* Break the data into the two half sized parts. */
  for (x = 0; x < HLen; x++)
    { S_DIGIT Tmp;
      N_Sub(Tmp, Data1[x], Data1[x + HLen]);
      N_Add(Data1[x],Data1[x], Data1[x + HLen]);
      N_Copy(Data1[x+HLen],Tmp);
    }
  if (Data1 != Data2) /* Don't do it if we are squaring */
    for (x = 0; x < HLen; x++)
      { S_DIGIT Tmp;
        N_Sub(Tmp, Data2[x], Data2[x + HLen]);
        N_Add(Data2[x], Data2[x], Data2[x + HLen]);
        N_Copy(Data2[x+HLen],Tmp);
      }
/*
** At the very first call to this function, when the data is zero
** padded, at this point, we are now calling Cyclic and NegaCyclic
** without zero padding.  Same data, just no zero padding.
*/

  CyclicNussMul(&Prod[0], &Data1[0], &Data2[0], HLen);

  NegaCyclicNussMul(&Prod[HLen], &Data1[HLen], &Data2[HLen], HLen);

/*
** Now do the Chinese Remainder Theorem and put he answers together.
*/
  for (x = 0; x < HLen; x++)
    {D_DIGIT Sum,Diff;
     N_AddDDD(Sum, Prod[x], Prod[x + HLen]);
     N_SubDDD(Diff, Prod[x], Prod[x + HLen]);
     N_Div2(Prod[x], Sum);
     N_Div2(Prod[x + HLen], Diff);
    }
}

#define CalcNUSSLen(_NumLen) ((_NumLen)*2)

void
FFTMul(Short * Prod, Short * Num1, Short * Num2,int Len)
/*
** Do a Nussbaumer convolution.
** Data is big-endian
*/
{
  int x, NUSSLen = CalcNUSSLen(Len);
  double Carry, Pyramid;

  for (x = 0; x < NUSSLen; x++) NUSSNum1[x]=NUSSNum2[x]=0;
  for (x = 0; x < Len; x++)     NUSSNum1[x]=Num1[Len-x-1];
  for (x = 0; x < Len; x++)     NUSSNum2[x]=Num2[Len-x-1];
/* Put our data into FFT in little endian format & zero pad */

/*
** Do the convolution... Take your pick which function to use!
** The cyclic version requires fewer MUL ops, but it has higher
** overhead.  The NegaCyclic version (ie: Nussbaumer itself)
** requires a few more operations but has less overhead.
**
NegaCyclicNussMul(NUSSProd,NUSSNum1,NUSSNum2,NUSSLen);
*/
CyclicNussMul(NUSSProd,NUSSNum1,NUSSNum2,NUSSLen);

/*
** Now release our carries, and store the results in the 'prod' array.
*/
  Carry = 0.0;
  for (x = NUSSLen; x > 0; x--)
    {
      Pyramid = NUSSProd[NUSSLen - x] + Carry;
/* No need to round Pyramid, like with a FFT.  It's already an integer */
      Carry = floor(Pyramid / BASE);
/* floor() is a very slow function on the x86/x87 */
      Prod[x - 1] = Pyramid - Carry * BASE;
    }
}

void
InitFFT(unsigned long int Len,int Base,int BaseDig)
{int Bytes;

BASE=Base;
BASE_DIG=BaseDig;

Bytes=sizeof(S_DIGIT)*CalcNUSSLen(Len);

if (BaseDig > 4)
  {
   printf("Error:  The fft is slightly hardwired for <= 4 digits in the base.\n");
   exit(0);
  }

if ((BaseDig == 4) && (Len >= 131072))
  {
   printf("Warning: Due to the data type sizes in this example program, Nussbaumer\n");
   printf("convolution fails when the base is >= 10,000 and the length is >= 65,536.\n");
  }

NUSSNum1=(S_DIGIT*)malloc(Bytes);
NUSSNum2=(S_DIGIT*)malloc(Bytes);
NUSSProd=(D_DIGIT*)malloc(CalcNUSSLen(Len)*sizeof(D_DIGIT));

if ((NUSSNum1==NULL) || (NUSSNum2==NULL) || (NUSSProd==NULL))
   {
    printf("Unable to allocate memory for FFTNum.\n");
    printf("Len=%d Bytes=%d\n",(int)Len,(int)Bytes);
    exit(0);
   }
}

void DeInitFFT(unsigned long int Len)
{
free(NUSSNum1);free(NUSSNum2);free(NUSSProd);
}






#if 0
/* Number of multiplications for Nussbaumer. */

double
CalcCycNussMuls(double Length)
{int L2;
if (Length<=0) return 0;
if (Length==1) return 1;
if (Length==2) return 2;
L2=log(Length/2)/log(2.0)+0.01;
return 3.0*pow(2.0,L2-1+ceil(log(L2)/log(2.0)))+CalcCycNussMuls(Length/2.0);
}

double
CalcNegNussMuls(double Length)
/* Number of simple muls for the negacycle Nussbaumer */
{int L2;
if (Length<=0) return 0;
if (Length==1) return 1;
if (Length==2) return 3;

L2=log(Length)/log(2.0)+0.01;
return 3.0*Length/2*pow(2.0,ceil(log(L2)/log(2.0)));
/* The pow(ceil()) raises to the next square pow of 2. */
}

double
CalcNegNussMuls(int Length)
/* A more natural way to compute it. */
{int L2,Height;
if (Length<=0) return 0;
if (Length==1) return 1;
if (Length==2) return 3;
/* If we don't recurse all the way, you can put a threshold limit here. */

Height = (UINT32) 1 << (Log2(Length) / 2);
/* Height can be hardwired at 4 or larger. */

return Height*2*CalcNegNussMuls(Length/Height);
}
/*
 Len   Neg  Cyc
  2 ==   3    2
  4 ==  12    5
  8 ==  48   17
 16 ==  96   65
 32 == 384  161
 64 == 768  545
128 ==1536 1313
256 ==3072 2849
*/
#endif