/*
NUSS.C
*/
/*
** This file is placed into the public domain by its author,
** Carey Bloodworth (Carey@Bloodworth.org) on July 16, 2001
**
** This multiplication demo is not designed for high performance.
** It's a tutorial program designed to be used with the information
** on my web site at www.Bloodworth.org
*/
/*
** This file demonstrates a very basic Nussbaumer convolution.
**
** To compile this using GCC:
** gcc main.c nuss.c -o nuss.exe
*/
/*
** There are a lot of ways to arrange things. (All use DiF/DiT pairs.)
** The code shown here is pretty much the 'official' style that Nussbaumer
** & Knuth use. The exception is that I've extracted the 'twist' code
** and made them into more readable functions, and I added some code
** to allow calling a faster FFT mul when things get small enough. (The
** fft mul has to be able to handle our signed data, of course.)
**
** This sample code is *VERY* crude. It'd need a lot of work before it
** would become even vaguely practical.
*/
#include
#include
#include
typedef unsigned long int UINT32;
typedef unsigned long long int UINT64;
typedef long int INT32;
typedef long long int INT64;
typedef short int Short;
/*
** Threshold where it's faster to use some other method to multiply
** data of lengths this and smaller.
**
** If you don't want a threshold (ie: you want to recurse all the way
** down), just set it to zero. It will, however, cost you a great
** deal of time.
**
** See note in SimpleNegaCyclicMul()
*/
/* FFT isn't included in this example program.
#define CYCLIC_THRESHOLD 1024
#define NEGACYCLIC_THRESHOLD 1024
#define ALLOW_FFTNEGMUL 1
*/
#define CYCLIC_THRESHOLD 0
#define NEGACYCLIC_THRESHOLD 0
typedef float S_DIGIT;
typedef double D_DIGIT;
/*
** Both data types need to be signed.
**
** Single needs to be big enough to hold BASE*Len. In other words
** Log2(Len) extra bits.
**
** Double needs to twice as big. It has to hold the product of two
** Singles, including 2*Log2(Len) extra bits.
**
** I'm using a float & double here, but it can be any data type. It
** can even be some multi-precision data type you code yourself.
*/
/*
** I'm sort of pretending that I have some special data type
** for Nussbaumer data elements. I don't, of course, but
** this helps keep the two data widths seperate.
**
** There are still a few places in the code where I access the
** data directly, but those areas are pretty obvious.
*/
#define N_Copy(_a,_b) _a=_b
#define N_Setv(_a,_b) _a=_b
#define N_Zero(_a) _a=0
#define N_Getv(_a) _a
#define N_Add(_s,_a,_b) _s=_a+_b
#define N_Sub(_d,_a,_b) _d=_a-_b
#define N_MulDSS(_p,_a,_b) {_p=((D_DIGIT)_a)*((D_DIGIT)_b);}
#define N_Neg(_a) _a=-_a
#define N_AddDDD(_s,_a,_b) _s=_a+_b
#define N_SubDDD(_d,_a,_b) _d=_a-_b
/*#define N_Div2(_b,_a) _b=((_a)>>1)*/
#define N_Div2(_b,_a) _b=((_a)/2)
#define N_ZeroD(_a) _a=0
#define N_SetvD(_a,_b) _a=_b
#define N_GetvD(_a) _a
#define N_NegD(_a) _a=-_a
#define N_CopyD(_a,_b) _a=_b
static S_DIGIT *NUSSNum1=NULL, *NUSSNum2=NULL;
static D_DIGIT *NUSSProd=NULL;
static double BASE;
static int BASE_DIG;
int Log2(UINT32 Num)
{UINT32 x=1;
int y=-1;
while (x <= Num)
{
x*=2;
y++;
}
return y;
}
#if 0
void DumpMatrix(char *str,S_DIGIT *data,int Height, int Width)
{int x,y;
if (str) printf("%s\n",str);
for (y=0;y NEGACYCLIC_THRESHOLD)
{
printf("hmmm... SimpleNegaCyclicMul() called with a length larger than expected.\n");
}
/* Faster to do a FFT?? */
#ifdef ALLOW_FFTNEGMUL
if (Len >= 32)
{
FFTNegMul(Prod,Data1,Data2,Len);
return;
}
#endif
/*
** Gotta be careful about this because at the higher levels it's
** quite possible to get only muls that are smaller than threshold.
*/
/*
** I was playing with a special 'counting' version of this program and
** I was trying various threshold sizes and so on. When I went up
** fairly high, I noticed cases where the entire Mul count was just
** the schoolboy below and no FFT mul! Obviously, in a real program
** that's NOT good!!
**
** Because of the way Nussbaumer splits up the data, this routine can
** be called with a wide variety of sizes, so we need a wide margin
** to decide whether to call the FFT or some other method.
**
** In my tests, I was just using a threshold of 1k-8k with NussLen's
** of up to 256m. (Not really realistic.) I discovered I needed to do
** various divisors to make sure I got all the FFT mul's I was entitled.
**
** 1024 /16 = 64
** 2048 /32 = 64
** 4096 /32 = 128
** 8192 /64 = 128
**
** In other words, it appears the MAXIMUM fft threshold must be no
** more than the power of two above the square root of the main
** threshold. ie: sqrt(1024)=32, means max fft threshold would be 64.
** Or, to put it another way, do a FFT from 64 to 1024 lengths.
**
** Again, this was done with extreme cases, but it is something
** you need to remember.
*/
for (p=0;p= q) N_AddDDD(Prod[p],Temp,Prod[p]);
else N_SubDDD(Prod[p],Prod[p],Temp);
}
}
}
/*
** These twists are the 'mul by root' routines. It just happens
** to work out that we can do all this by shifting and negating.
** Now, this isn't how Knuth & Nussbaumer describe it. They do
** it as part of the butterfly and doing it on a bias, putting the
** result into temp storage, and then copying it back into the
** correct location.
**
** Doing it this way makes the code a bit clearer. Looks more
** like a regular FFT. Slow, though.
*/
void Twist(S_DIGIT *Data,int Len, int Expon)
{S_DIGIT *Temp = malloc(sizeof(S_DIGIT)*Len);
int x;
if (Temp==NULL) {printf("Can't allocate for Twist.\n");exit(0);}
if (Expon > Len) {printf("Twist: Expon=%d Len=%d\n",Expon,Len);exit(0);}
memcpy(Temp,Data,Len*sizeof(S_DIGIT));
memcpy(Data,Temp+(Len-Expon),Expon*sizeof(S_DIGIT));
memcpy(Data+Expon,Temp,(Len-Expon)*sizeof(S_DIGIT));
for (x=0;x<(Len-Expon);x++) N_Neg(Data[Expon+x]); /* negate */
free(Temp);
}
void TwistD(D_DIGIT *Data,int Len, int Expon)
/* Twist for the double precision output data */
/* Note that it's a different direction, too. */
{D_DIGIT *Temp = malloc(sizeof(D_DIGIT)*Len);
int x;
if (Temp==NULL) {printf("Can't allocate for TwistD.\n");exit(0);}
if (Expon > Len) {printf("TwistD: Expon=%d Len=%d\n",Expon,Len);exit(0);}
memcpy(Temp,Data,Len*sizeof(D_DIGIT));
memcpy(Data,Temp+(Expon),(Len-Expon)*sizeof(D_DIGIT));
memcpy(Data+(Len-Expon),Temp,Expon*sizeof(D_DIGIT));
for (x=0;x<(Expon);x++) N_NegD(Data[Len-Expon+x]);
free(Temp);
}
/*
** The Nussbaumer FFTs are actually pretty standard FFTs.
**
** The differences are
** 1: They use their own special root.
** 2: Because of #1, the multiplication by the root is a 'twist'
** 3: The transforms are column based while the twist is row based.
**
** Also worth noting is that Knuth & Nussbaumer do the 'division by
** the power of two' normalization during the inverse transform.
** Most transforms wait until after it's done, and so could these,
** but I figured it was worth leaving it in this way.
**
** The transforms themselves are pretty much a regular transform, so that
** means you can use about any style you want.
**
** Normally the transforms are done vertically and the twist is done
** horizontally. It's possible to do it the other way, which gets rid
** of the matrix transposition.
**
** It's also possible to do the transforms recursively and do it
** without the twists. Everything is 'on the bias'. There is a
** lot of overhead, though.
*/
static UINT32
BitReversal(UINT32 Mask, UINT32 Val)
{
UINT32 R = 0;
if (Val==0) return 0;
do
{
R *= 2;
R += (Val & 1);
Mask /= 2;
Val /= 2;
}
while (Mask > 1);
return R;
}
#define DATA(y,x) Data[((y)*Width)+(x)]
/* DATA[Col,Row] ie: DATA[Height,Width] */
void
FwdNussFFT(S_DIGIT *Data,int Height,int Width)
/* DiF Nussbaumer FFT without scrambling w/ BitReversal roots */
{UINT32 x,Step,Chunk,n;
for (Step=Height; Step > 1; Step/=2)
{ UINT32 HalfStep=Step/2;
UINT32 Chunks = Height/Step;
UINT32 Root = (Step*Width)/Height;
for (Chunk = 0; Chunk < Chunks; Chunk++)
{
UINT32 CurRoot = BitReversal(Chunks, Chunk) * Root;
for (n = 0; n < HalfStep; n++)
{
UINT32 Left = Chunk*Step+n;
UINT32 Right = Left + HalfStep;
Twist(&DATA(Right,0),Width,CurRoot);
for (x = 0; x < Width; x++)
{S_DIGIT L,R;
N_Copy(L,DATA(Left,x));
N_Copy(R,DATA(Right,x));
N_Sub(DATA(Left,x),L,R);
N_Add(DATA(Right,x),L,R);
}
}
}
}
}
void
RevNussFFT(D_DIGIT *Data,int Height,int Width)
/* DiT Nussbaumer FFT without scrambling w/ BitReversal roots */
{UINT32 x,Step,Chunk,n;
for (Step = 2; Step <= Height; Step *=2)
{ UINT32 HalfStep=Step/2;
UINT32 Chunks = Height/Step;
UINT32 Root = (Step*Width)/Height;
for (Chunk = 0; Chunk < Chunks; Chunk++)
{
UINT32 Expon = BitReversal(Chunks, Chunk) * Root;
for (n = 0; n < HalfStep; n++)
{
UINT32 Left = Chunk*Step+n;
UINT32 Right = Left + HalfStep;
for (x = 0; x < Width; x++)
{D_DIGIT L,R,T;
N_CopyD(L,DATA(Left,x));
N_CopyD(R,DATA(Right,x));
N_AddDDD(T,L,R);N_Div2(DATA(Left,x),T);
N_SubDDD(T,L,R);N_Div2(DATA(Right,x),T);
}
TwistD(&DATA(Right,0),Width,Expon);
}
}
}
}
#undef DATA
void
NegaCyclicNussMul(D_DIGIT *Result, S_DIGIT *Data1, S_DIGIT *Data2, UINT32 NussLen)
{
UINT32 Height,Width;
S_DIGIT *TData1,*TData2;
D_DIGIT *TResult;
UINT32 x,y;
/* Small enough we can do it faster some other way? Just example. */
#if (NEGACYCLIC_THRESHOLD > 0)
if (NussLen <= NEGACYCLIC_THRESHOLD)
{SimpleNegaCyclicMul(Result,Data1,Data2,NussLen);return;}
#else
/* If we don't want a threshold, we need to hardwire a couple of sizes. */
if (NussLen == 1)
{ /* Plain single digit mul. */
N_MulDSS(Result[0], Data1[0], Data2[0]);
return;
}
if (NussLen == 2)
{/* Double digit mul. Do it like a 'complex' mul cause it's faster.
** Result[0] = Data1[0]*(Data2[0]+Data2[1]) - Data2[1]*(Data1[0]+Data1[1])
** Result[1] = Data1[0]*(Data2[0]+Data2[1]) + Data2[0]*(Data1[1]-Data1[0])
*/
S_DIGIT Sum1,Sum2;
D_DIGIT Prod1,Prod2;
N_Add(Sum1, Data1[0], Data1[1]);
N_Add(Sum2, Data2[0], Data2[1]);
N_MulDSS(Prod1, Data1[0], Sum2);
N_MulDSS(Prod2, Sum1, Data2[1]);
N_SubDDD(Result[0], Prod1, Prod2);
N_Sub(Sum1, Data1[1], Data1[0]);
N_MulDSS(Prod2, Sum1, Data2[0]);
N_AddDDD(Result[1], Prod1, Prod2);
return;
}
/* We could, of course, go further.... */
#endif
Height = (UINT32) 1 << (Log2(NussLen) / 2); /* Pow2(log2(sqrt(Len))) */
Width = NussLen / Height;
/* Width >= Height before the zero padding. */
/* Width <= Height after the zero padding. */
/*
** If we want to, we can increase the width and decrease the height
** Just make sure height is at least 2 or it wont reduce and
** we'd end up in an infinite loop. With zero padding later,
** the height will be at least 4.
**
** The FFT length can't be larger than the twist length is.
**
** Note: This can be a *LOT* slower, esp. if you push it to far.
**
** For example...
Height=2;Width=NussLen/Height;
*/
TData1 = malloc(sizeof(S_DIGIT)* 2*NussLen);
TData2 = malloc(sizeof(S_DIGIT)* 2*NussLen);
TResult = malloc(sizeof(D_DIGIT)* 2*NussLen);
/* Make our data look like a matrix */
#define DATA1(i,j) TData1[((i)*Width)+(j)]
#define DATA2(i,j) TData2[((i)*Width)+(j)]
#define RESULT(i,j) TResult[((i)*Width)+(j)]
/* [Col,Row] ie: [Height,Width] */
if ((TData1==NULL) || (TData2==NULL) || (TResult==NULL))
{printf("Unable to allocate workspace.\n");exit(0);}
/*
** Transpose the rows & columns and then zero pad.
** This makes the data vertical, rather than horizontal.
** The zero padding is added onto the right.
*/
for (x=0;x<2*NussLen;x++)
{
N_Zero(TData1[x]);
N_Zero(TData2[x]);
N_ZeroD(TResult[x]);
}
for (y = 0; y < Height; y++)
for (x = 0; x < Width; x++)
{
N_Copy(DATA1(y, x), Data1[(x*Height)+y]);
N_Copy(DATA2(y, x), Data2[(x*Height)+y]);
}
/* The height has now been doubled */
Height*=2;
FwdNussFFT(TData1,Height,Width);
FwdNussFFT(TData2,Height,Width);
/*
** Compute the negacyclic product of each row.
** We do this by recursing, but really we could call any negacylcic
** multiplication routine.
*/
for (y = 0; y < Height; y++)
NegaCyclicNussMul(&RESULT(y,0), &DATA1(y,0), &DATA2(y,0), Width);
RevNussFFT(TResult,Height,Width);
Height/=2;
/* Make the result negacyclic and compact. */
for (y = 0; y < Height; y++)
{
N_SubDDD(Result[y], RESULT(y, 0), RESULT(y+Height, Width-1));
for (x = 1; x < Width; x++)
N_AddDDD(Result[(x*Height) + y], RESULT(y, x), RESULT(y+Height, x-1));
}
#undef DATA1
#undef DATA2
#undef RESULT
free(TData1);free(TData2);free(TResult);
}
void SimpleCyclicMul(D_DIGIT *Prod, S_DIGIT *Data1, S_DIGIT *Data2, UINT32 Len)
/* Plain old n^2 schoolboy for example purposes */
{UINT32 p,i,q;
D_DIGIT Temp;
if (Len > CYCLIC_THRESHOLD)
{
printf("hmmm... SimpleCyclicMul() called with a length larger than expected.\n");
}
/* We could do a FFT here. */
for (p=0;p 0)
if (Len <= CYCLIC_THRESHOLD)
{SimpleCyclicMul(Prod,Data1,Data2,Len);return;}
#else
/* If we don't want to do a threshold, hardwire a couple of sizes. */
if (Len == 1)
{/* Basic single digit multiply. */
N_MulDSS(Prod[0],Data1[0], Data2[0]);
return;
}
if (Len == 2)
{ /* Double digit mul. */
/*
** Prod[0] := ((Data1[0]+Data1[1])*(Data2[0]+Data2[1]) +
** (Data1[0]-Data1[1])*(Data2[0]-Data2[1])) / 2
** Prod[1] := ((Data1[0]+Data1[1])*(Data2[0]+Data2[1]) -
** (Data1[0]-Data1[1])*(Data2[0]-Data2[1])) / 2
*/
S_DIGIT Sum1,Sum2,Dif1,Dif2;
D_DIGIT Prod1, Prod2;
N_Add(Sum1, Data1[0], Data1[1]);
N_Sub(Dif1, Data1[0], Data1[1]);
N_Add(Sum2, Data2[0], Data2[1]);
N_Sub(Dif2, Data2[0], Data2[1]);
N_MulDSS(Prod1, Sum1, Sum2);
N_MulDSS(Prod2, Dif1, Dif2);
N_AddDDD(Prod[0], Prod1, Prod2);
N_Div2(Prod[0], Prod[0]);
N_SubDDD(Prod[1], Prod1, Prod2);
N_Div2(Prod[1], Prod[1]);
return;
}
#endif
/* Break the data into the two half sized parts. */
for (x = 0; x < HLen; x++)
{ S_DIGIT Tmp;
N_Sub(Tmp, Data1[x], Data1[x + HLen]);
N_Add(Data1[x],Data1[x], Data1[x + HLen]);
N_Copy(Data1[x+HLen],Tmp);
}
if (Data1 != Data2) /* Don't do it if we are squaring */
for (x = 0; x < HLen; x++)
{ S_DIGIT Tmp;
N_Sub(Tmp, Data2[x], Data2[x + HLen]);
N_Add(Data2[x], Data2[x], Data2[x + HLen]);
N_Copy(Data2[x+HLen],Tmp);
}
/*
** At the very first call to this function, when the data is zero
** padded, at this point, we are now calling Cyclic and NegaCyclic
** without zero padding. Same data, just no zero padding.
*/
CyclicNussMul(&Prod[0], &Data1[0], &Data2[0], HLen);
NegaCyclicNussMul(&Prod[HLen], &Data1[HLen], &Data2[HLen], HLen);
/*
** Now do the Chinese Remainder Theorem and put he answers together.
*/
for (x = 0; x < HLen; x++)
{D_DIGIT Sum,Diff;
N_AddDDD(Sum, Prod[x], Prod[x + HLen]);
N_SubDDD(Diff, Prod[x], Prod[x + HLen]);
N_Div2(Prod[x], Sum);
N_Div2(Prod[x + HLen], Diff);
}
}
#define CalcNUSSLen(_NumLen) ((_NumLen)*2)
void
FFTMul(Short * Prod, Short * Num1, Short * Num2,int Len)
/*
** Do a Nussbaumer convolution.
** Data is big-endian
*/
{
int x, NUSSLen = CalcNUSSLen(Len);
double Carry, Pyramid;
for (x = 0; x < NUSSLen; x++) NUSSNum1[x]=NUSSNum2[x]=0;
for (x = 0; x < Len; x++) NUSSNum1[x]=Num1[Len-x-1];
for (x = 0; x < Len; x++) NUSSNum2[x]=Num2[Len-x-1];
/* Put our data into FFT in little endian format & zero pad */
/*
** Do the convolution... Take your pick which function to use!
** The cyclic version requires fewer MUL ops, but it has higher
** overhead. The NegaCyclic version (ie: Nussbaumer itself)
** requires a few more operations but has less overhead.
**
NegaCyclicNussMul(NUSSProd,NUSSNum1,NUSSNum2,NUSSLen);
*/
CyclicNussMul(NUSSProd,NUSSNum1,NUSSNum2,NUSSLen);
/*
** Now release our carries, and store the results in the 'prod' array.
*/
Carry = 0.0;
for (x = NUSSLen; x > 0; x--)
{
Pyramid = NUSSProd[NUSSLen - x] + Carry;
/* No need to round Pyramid, like with a FFT. It's already an integer */
Carry = floor(Pyramid / BASE);
/* floor() is a very slow function on the x86/x87 */
Prod[x - 1] = Pyramid - Carry * BASE;
}
}
void
InitFFT(unsigned long int Len,int Base,int BaseDig)
{int Bytes;
BASE=Base;
BASE_DIG=BaseDig;
Bytes=sizeof(S_DIGIT)*CalcNUSSLen(Len);
if (BaseDig > 4)
{
printf("Error: The fft is slightly hardwired for <= 4 digits in the base.\n");
exit(0);
}
if ((BaseDig == 4) && (Len >= 131072))
{
printf("Warning: Due to the data type sizes in this example program, Nussbaumer\n");
printf("convolution fails when the base is >= 10,000 and the length is >= 65,536.\n");
}
NUSSNum1=(S_DIGIT*)malloc(Bytes);
NUSSNum2=(S_DIGIT*)malloc(Bytes);
NUSSProd=(D_DIGIT*)malloc(CalcNUSSLen(Len)*sizeof(D_DIGIT));
if ((NUSSNum1==NULL) || (NUSSNum2==NULL) || (NUSSProd==NULL))
{
printf("Unable to allocate memory for FFTNum.\n");
printf("Len=%d Bytes=%d\n",(int)Len,(int)Bytes);
exit(0);
}
}
void DeInitFFT(unsigned long int Len)
{
free(NUSSNum1);free(NUSSNum2);free(NUSSProd);
}
#if 0
/* Number of multiplications for Nussbaumer. */
double
CalcCycNussMuls(double Length)
{int L2;
if (Length<=0) return 0;
if (Length==1) return 1;
if (Length==2) return 2;
L2=log(Length/2)/log(2.0)+0.01;
return 3.0*pow(2.0,L2-1+ceil(log(L2)/log(2.0)))+CalcCycNussMuls(Length/2.0);
}
double
CalcNegNussMuls(double Length)
/* Number of simple muls for the negacycle Nussbaumer */
{int L2;
if (Length<=0) return 0;
if (Length==1) return 1;
if (Length==2) return 3;
L2=log(Length)/log(2.0)+0.01;
return 3.0*Length/2*pow(2.0,ceil(log(L2)/log(2.0)));
/* The pow(ceil()) raises to the next square pow of 2. */
}
double
CalcNegNussMuls(int Length)
/* A more natural way to compute it. */
{int L2,Height;
if (Length<=0) return 0;
if (Length==1) return 1;
if (Length==2) return 3;
/* If we don't recurse all the way, you can put a threshold limit here. */
Height = (UINT32) 1 << (Log2(Length) / 2);
/* Height can be hardwired at 4 or larger. */
return Height*2*CalcNegNussMuls(Length/Height);
}
/*
Len Neg Cyc
2 == 3 2
4 == 12 5
8 == 48 17
16 == 96 65
32 == 384 161
64 == 768 545
128 ==1536 1313
256 ==3072 2849
*/
#endif