CUFFT BENCHMARKING TOOL v1.0 8800GTX owners please post your results!

EDIT: GeForce 8800 GTS, GeForce 8800 GTX, GeForce 8800 Ultra, and Quadro FX5600 results are now posted below.

Although not a G8x owner myself (yet!), I am very interested to know how quickly an 8800GTX could perform 1D FFTs with 128K elements, now that the 16K limit has been removed.

For the benefit of all, I’ve written the attached benchmarking tool and invite anyone with an 8800-series card to run it and post your results.

#define WIN32_LEAN_AND_MEAN

#include <windows.h>

#include <cufft.h>

#include <cutil.h>

#define MIN_NX 1024

#define MAX_NX 262144

#define MIN_BATCH 1

#define MAX_BATCH 64

#define FFTS_PER_TEST 32

int main(int argc, char **argv)

	{

	// force the program to run on a single processor 

	DWORD processAffinityMask;

	DWORD systemAffinityMask;

	GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);

	if (processAffinityMask!=1)

  {

  SetProcessAffinityMask(GetCurrentProcess(), 1);

  system(argv[0]);

  return 0;

  }

	// introduce the program

	printf("\nCUFFT BENCHMARKING TOOL v1.0\n\n");

	printf("This program evaluates the utility of using CUDA devices as\n");

	printf("FFT coprocessors for digital signal processing applications.\n");

	printf("Each table entry is an estimate of the maximum number of FFTs\n");

	printf("that can be performed per second, considering both the time\n");

	printf("needed to calculate the FFTs and the time needed to copy data\n");

	printf("to and from the CUDA device.\n\n");

	printf("To minimize interference from the OS and other programs, each\n");

	printf("estimate is based on the fastest of %i identical calculations.\n", FFTS_PER_TEST);

	// perform CUDA device initialization

	CUT_DEVICE_INIT();

	// display CUDA device info

	int deviceCount;

	CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));

	for (int dev = 0; dev < deviceCount; ++dev)

  {

  cudaDeviceProp deviceProp;

  CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));

  printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

  printf("  Major revision number:                         %d\n", deviceProp.major);

  printf("  Minor revision number:                         %d\n", deviceProp.minor);

  printf("  Total amount of global memory:                 %d bytes\n", deviceProp.totalGlobalMem);

  printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);

  }

	// initialize host PC arrays

	int hostArrayLength = MAX_NX * MAX_BATCH;

	cufftComplex *hostArrayA = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex));

	cufftComplex *hostArrayB = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex));

	float *element = (float *)hostArrayA;

	float *elementLimit = (float *)(hostArrayA+hostArrayLength);

	while (element<elementLimit)

  *element++ = rand();

	// run timing tests for in-place and out-of-place FFTs

	for (int out_of_place=0; out_of_place<2; ++out_of_place)

  {

  // print table headings

  printf("\n");

  printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("---------", batch); printf("\n");

  if (out_of_place)

  	printf("1D Complex-to-Complex Out-of-Place FFTs\n");

  else

  	printf("1D Complex-to-Complex In-Place FFTs\n");

  printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("---------", batch); printf("\n");

  printf(" nx     | batch\n");

  printf("        "); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n");

  printf("        "); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("|%8i", batch); printf("\n");

  printf("--------"); for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n");

 // run timing tests for a variety of FFT array lengths

  for (int nx=MIN_NX; nx<MAX_NX+1; nx*=2)

  	{

  	printf("%8i", nx);

 	// run timing tests for a variety of batch settings

  	for (int batch=MIN_BATCH; batch<MAX_BATCH+1; batch*=4)

    {

    // generate CUFFT plan

    cufftHandle plan;

    CUFFT_SAFE_CALL(cufftPlan1d(&plan, nx, CUFFT_C2C, batch));

   // allocate arrays on host PC and CUDA device, fill host array with random data

    size_t arraySize = sizeof(cufftComplex) * nx * batch;

    cufftComplex *deviceArrayA;

    cufftComplex *deviceArrayB;

    CUDA_SAFE_CALL(cudaMalloc((void**)&deviceArrayA, arraySize));

    if (out_of_place)

    	CUDA_SAFE_CALL(cudaMalloc((void**)&deviceArrayB, arraySize));

   // run a series of identical timing tests, looking for the fastest one (the one with the least OS interference)

    int fastestRateFound = 0;

    char *spinner = "|/-\";

    int spindex = 0;

    for (int rep=0; rep<FFTS_PER_TEST; ++rep)

    	{

    	__int64 startCount;

    	__int64 stopCount;

    	__int64 countsPerSec;

    	QueryPerformanceFrequency((LARGE_INTEGER *)&countsPerSec);

   	if (out_of_place)

      {

      QueryPerformanceCounter((LARGE_INTEGER *)&startCount);

      CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice));

      CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayB, CUFFT_FORWARD));

      CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayB, arraySize, cudaMemcpyDeviceToHost));

      QueryPerformanceCounter((LARGE_INTEGER *)&stopCount);

      }

    	else

      {

      QueryPerformanceCounter((LARGE_INTEGER *)&startCount);

      CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice));

      CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayA, CUFFT_FORWARD));

      CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayA, arraySize, cudaMemcpyDeviceToHost));

      QueryPerformanceCounter((LARGE_INTEGER *)&stopCount);

      }

   	int fftsPerSec = (int)( (countsPerSec*batch) / (stopCount-startCount) );

    	if (fastestRateFound<fftsPerSec)

      fastestRateFound = fftsPerSec;

   	// provide some entertainment

    	printf("%c\b", spinner[spindex++]);

    	if (spindex==4)

      spindex = 0;

    	}

    printf("|%8i", fastestRateFound);

   CUFFT_SAFE_CALL(cufftDestroy(plan));

    CUDA_SAFE_CALL(cudaFree(deviceArrayA));

    if (out_of_place)

    	CUDA_SAFE_CALL(cudaFree(deviceArrayB));

    }

  	printf("\n");

  	}

  printf("--------+--------"); for (int batch=MIN_BATCH+1; batch<MAX_BATCH+1; batch*=4) printf("+--------", batch); printf("\n");

  }

	free(hostArrayA);

	free(hostArrayB);

	printf("\nPress ENTER to exit...\n");

	fflush( stdout);

	fflush( stderr);

	getchar();

	exit(EXIT_SUCCESS);

	}

I did a quick conversion to Linux, I am not 100% sure about the timing.

I only have a Quadro FX5600 in my system :) , no GTX, but these numbers should be pretty close to the GTX.

CUFFT BENCHMARKING TOOL v1.0

This program evaluates the utility of using CUDA devices as

FFT coprocessors for digital signal processing applications.

Each table entry is an estimate of the maximum number of FFTs

that can be performed per second, considering both the time

needed to calculate the FFTs and the time needed to copy data

to and from the CUDA device.

To minimize interference from the OS and other programs, each

estimate is based on the fastest of 32 identical calculations.

Device 0: "Quadro FX 5600"

  Major revision number:                         1

  Minor revision number:                         0

  Total amount of global memory:                 1609891840 bytes

  Clock rate:                                    1350000 kilohertz

--------+-----------------------------------

1D Complex-to-Complex In-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

--------+--------+--------+--------+--------

    1024|   14492|   37735|   62015|   70953

    2048|    9803|   21857|   30651|   29130

    4096|    7352|   13605|   15810|   15417

    8192|    4999|    7272|    7210|    7792

   16384|    3164|    3487|    3357|    3438

   32768|    1745|    1669|    1765|    1789

   65536|     938|     903|     924|     924

  131072|     424|     456|     464|     462

  262144|     207|     212|     213|     212

--------+--------+--------+--------+--------

--------+-----------------------------------

1D Complex-to-Complex Out-of-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

--------+--------+--------+--------+--------

    1024|   14285|   37383|   61776|   70484

    2048|    9615|   21739|   30592|   29698

    4096|    7299|   13651|   15794|   15425

    8192|    4999|    7285|    7216|    7796

   16384|    3154|    3469|    3352|    3444

   32768|    1754|    1679|    1775|    1798

   65536|     943|     905|     931|     935

  131072|     426|     461|     466|     465

  262144|     206|     211|     212|     212

--------+--------+--------+--------+--------

With this application using page-locked (pinned) memory will result in a big boost:

Device 0: "Quadro FX 5600"

  Major revision number:                         1

  Minor revision number:                         0

  Total amount of global memory:                 1609891840 bytes

  Clock rate:                                    1350000 kilohertz

--------+-----------------------------------

1D Complex-to-Complex In-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

--------+--------+--------+--------+--------

    1024|   16129|   51282|  115942|  147465

    2048|   11235|   32520|   56737|   64842

    4096|    9259|   23255|   29574|   32454

    8192|    7142|   12903|   15065|   16137

   16384|    5076|    5934|    6177|    6254

   32768|    3012|    3246|    3328|    3343

   65536|    1692|    1766|    1792|    1780

  131072|     868|     890|     902|     895

  262144|     380|     384|     384|     382

--------+--------+--------+--------+--------

--------+-----------------------------------

1D Complex-to-Complex Out-of-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

--------+--------+--------+--------+--------

    1024|   16129|   51282|  116788|  144469

    2048|   11235|   32520|   56537|   64451

    4096|    9259|   23391|   29629|   32569

    8192|    7194|   12861|   14981|   16149

   16384|    5102|    5856|    6196|    6257

   32768|    3012|    3265|    3366|    3391

   65536|    1683|    1793|    1822|    1827

  131072|     876|     903|     909|     911

  262144|     378|     382|     382|     380

--------+--------+--------+--------+--------

Just allocate the memory with:

#ifdef PINNED

// Page-locked (pinned) host buffers, allocated through the CUDA runtime.
// The allocation results are checked with the same CUDA_SAFE_CALL macro the
// rest of the tool uses, instead of being ignored.
// NOTE(review): unlike calloc, cudaMallocHost is not documented to zero the
// memory; the tool fills hostArrayA with random data and only writes to
// hostArrayB, so this should not matter here.
cufftComplex *hostArrayA;

CUDA_SAFE_CALL(cudaMallocHost((void **) &hostArrayA, hostArrayLength * sizeof(cufftComplex)));

cufftComplex *hostArrayB;

CUDA_SAFE_CALL(cudaMallocHost((void **) &hostArrayB, hostArrayLength * sizeof(cufftComplex)));

#else

// Ordinary pageable host buffers.
cufftComplex *hostArrayA = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex));

cufftComplex *hostArrayB = (cufftComplex *)calloc(hostArrayLength, sizeof(cufftComplex));

#endif

and free the memory with:

#ifdef PINNED

// Buffers obtained from cudaMallocHost must be released with cudaFreeHost;
// the result is checked with CUDA_SAFE_CALL like the other runtime calls.
CUDA_SAFE_CALL(cudaFreeHost(hostArrayA));

CUDA_SAFE_CALL(cudaFreeHost(hostArrayB));

#else

// Buffers obtained from calloc are released with free.
free(hostArrayA);

free(hostArrayB);

#endif

Since originating this thread and reading mfatica’s results, I’ve gone ahead and purchased an 8800 GTS and have an 8800 GTX on order. I’ll post the GTX results when the card arrives, but I’m posting the GTS results now.

These tests were done using page-locked (pinned) memory (thanks mfatica!) and show that the Quadro FX 5600 is roughly 37% faster than a GeForce 8800 GTS for 1D 64K/128K/256K FFTs.

CUFFT BENCHMARKING TOOL v1.0

This program evaluates the utility of using CUDA devices as

FFT coprocessors for digital signal processing applications.

Each table entry is an estimate of the maximum number of FFTs

that can be performed per second, considering both the time

needed to calculate the FFTs and the time needed to copy data

 to and from the CUDA device.

To minimize interference from the OS and other programs, each

estimate is based on the fastest of 32 identical calculations.

Device 0: "GeForce 8800 GTS"

  Major revision number:                         1

  Minor revision number:                         0

  Total amount of global memory:                 335216640 bytes

  Clock rate:                                    1188000 kilohertz

--------+-----------------------------------

1D Complex-to-Complex In-Place FFTs

--------+-----------------------------------

  nx     | batch

         +--------+--------+--------+--------

         |       1|       4|      16|      64

--------+--------+--------+--------+--------

     1024|    2309|   14100|   42173|   84285

     2048|    3139|   11961|   28289|   43213

     4096|    3122|    9121|   17542|   22795

     8192|    2810|    6643|   10001|   11609

    16384|    2027|    3600|    4262|    4464

    32768|    1548|    2209|    2451|    2524

    65536|    1027|    1257|    1341|    1356

   131072|     569|     644|     665|     667

   262144|     258|     271|     274|     275

--------+--------+--------+--------+--------

--------+-----------------------------------

1D Complex-to-Complex Out-of-Place FFTs

--------+-----------------------------------

  nx     | batch

         +--------+--------+--------+--------

         |       1|       4|      16|      64

--------+--------+--------+--------+--------

     1024|    2309|   14058|   42151|   83766

     2048|    3336|   11933|   28134|   42854

     4096|    3103|    9040|   17504|   22952

     8192|    2803|    6621|   10023|   11721

    16384|    2124|    3606|    4288|    4507

    32768|    1550|    2230|    2499|    2564

    65536|    1033|    1271|    1355|    1381

   131072|     566|     642|     662|     668

   262144|     255|     267|     270|     270

--------+--------+--------+--------+--------

Here are the GeForce 8800GTX benchmark results, again using page-locked (pinned) memory.

Device 0: "GeForce 8800 GTX"

  Major revision number:                         1

  Minor revision number:                         0

  Total amount of global memory:                 804978688 bytes

  Clock rate:                                    1350000 kilohertz

--------+-----------------------------------

1D Complex-to-Complex In-Place FFTs

--------+-----------------------------------

  nx     | batch

         +--------+--------+--------+--------

         |       1|       4|      16|      64

--------+--------+--------+--------+--------

     1024|    2885|   13695|   43645|   88782

     2048|    3100|   11759|   30192|   47404

     4096|    3076|    9984|   19349|   25740

     8192|    2791|    7183|   11234|   13252

    16384|    2153|    4183|    5157|    5471

    32768|    1681|    2527|    2855|    2947

    65536|    1139|    1440|    1546|    1568

   131072|     649|     750|     781|     784

   262144|     316|     335|     341|     343

--------+--------+--------+--------+--------

--------+-----------------------------------

1D Complex-to-Complex Out-of-Place FFTs

--------+-----------------------------------

  nx     | batch

         +--------+--------+--------+--------

         |       1|       4|      16|      64

--------+--------+--------+--------+--------

     1024|    2189|   13674|   43605|   88677

     2048|    3323|   11752|   30075|   47368

     4096|    3066|    9961|   19332|   25785

     8192|    2780|    7158|   11216|   13378

    16384|    2262|    4174|    5141|    5482

    32768|    1676|    2531|    2862|    2959

    65536|    1140|    1459|    1564|    1592

   131072|     650|     750|     784|     790

   262144|     311|     331|     336|     338

--------+--------+--------+--------+--------

Finally, here’s how much faster the 8800GTX and FX5600 are, when compared to the 8800GTS:

--------------------------

CUFFT Performance Increase

Relative to an 8800GTS

--------+-----------------

  nx     | batch: 64

         +--------+--------

         | 8800GTX|  FX5600

--------+--------+--------

     1024|      6%|     74%

     2048|     10%|     50%

     4096|     13%|     42%

     8192|     14%|     38%

    16384|     22%|     39%

    32768|     16%|     32%

    65536|     15%|     32%

   131072|     18%|     35%

   262144|     25%|     40%

--------+--------+--------

CClark

What are your bandwidthTest scores on the FX5600 (dtoh and htod) and is that in an x16 slot?

Thanks,

The 8800GTS and 8800GTX tests were done in the x16 slot of a server-class Dell machine with 2GB RAM and a dual-core Xeon running at 3.4GHz. mfatica did the FX5600 test, but I’m fairly certain that that too was an x16 slot. I haven’t looked for a bandwidth test because I was (and am) specifically interested in 1D FFT throughput.

Do you have a particular test program in mind, in case mfatica revisits this thread?

These are the bandwidth numbers using the example from the SDK for both pageable and pinned memory. OS is RHEL4 32 bit

$ ./bandwidthTest 

Quick Mode

Host to Device Bandwidth for Pageable memory

.

Transfer Size (Bytes)   Bandwidth(MB/s)

 33554432               1177.0

Quick Mode

Device to Host Bandwidth for Pageable memory

.

Transfer Size (Bytes)   Bandwidth(MB/s)

 33554432               1050.1

Quick Mode

Device to Device Bandwidth

.

Transfer Size (Bytes)   Bandwidth(MB/s)

 33554432               63222.4

$ ./bandwidthTest --memory=pinned

Quick Mode

Host to Device Bandwidth for Pinned memory

.

Transfer Size (Bytes)   Bandwidth(MB/s)

 33554432               3184.7

Quick Mode

Device to Host Bandwidth for Pinned memory

.

Transfer Size (Bytes)   Bandwidth(MB/s)

 33554432               3106.3

Quick Mode

Device to Device Bandwidth

.

Transfer Size (Bytes)   Bandwidth(MB/s)

 33554432               63203.6

Dear mfatica, are those lines the only ones you changed for the Linux version? My professor would like to have the full version of this code. Please let me know if you changed anything else (I am not an expert in Linux).

Thanks,

StrikerBlitz

[quote name=‘cclark’ date=‘Aug 3 2007, 04:33 PM’]


// Timed section of one benchmark repetition (quoted from the first post).
// Both branches time the same three steps between the two
// QueryPerformanceCounter calls: host-to-device copy, forward C2C FFT,
// device-to-host copy. They differ only in where the FFT writes its output.
if (out_of_place)

 {

 // out-of-place: the FFT reads deviceArrayA and writes deviceArrayB
 QueryPerformanceCounter((LARGE_INTEGER *)&startCount);

 CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice));

 CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayB, CUFFT_FORWARD));

 CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayB, arraySize, cudaMemcpyDeviceToHost));

 QueryPerformanceCounter((LARGE_INTEGER *)&stopCount);

 }

else

 {

 // in-place: the FFT overwrites deviceArrayA with its own output
 QueryPerformanceCounter((LARGE_INTEGER *)&startCount);

 CUDA_SAFE_CALL(cudaMemcpy(deviceArrayA, hostArrayA, arraySize, cudaMemcpyHostToDevice));

 CUFFT_SAFE_CALL(cufftExecC2C(plan, deviceArrayA, deviceArrayA, CUFFT_FORWARD));

 CUDA_SAFE_CALL(cudaMemcpy(hostArrayB, deviceArrayA, arraySize, cudaMemcpyDeviceToHost));

 QueryPerformanceCounter((LARGE_INTEGER *)&stopCount);

 }

Dear cclark, may I ask what memory model is used when these few lines are called? Are they device memory, shared memory, or global memory?

Thanks.

StrikerBlitz,

I compiled this as a Win32 console program using Visual C++ 2005 Express Edition and the CUDA SDK. hostArrayA and hostArrayB are in the computer’s RAM, while deviceArrayA and deviceArrayB are in the video card’s RAM. For more details about how these calls are implemented by CUDA, I’ll have to refer you to the CUDA documentation. (I know just enough about CUDA now to tinker with the sample programs.)

cclark

According to the CUDA manual the 8800 is capable of 52GFlops running benchFFT. From the results above I calculate a peak GFLOPs of just over 10. Is this correct?

Here is how I came to these results,

Taking nx = 262144, batch = 4, number of FFTs per second = 384

GFLOPS = billions of floating point operations per second

        = (6 * nx * log2(nx)) / (time per FFT in microseconds) / 1000

        = (6 * nx * log2(nx)) / (1/numberOfFFTsPerSecond * 1000000) / 1000

        = (6 * 262144 * 18) / (1/384 * 1000000) / 1000 = 28311552 / 2604.2 / 1000 = 10.87 GFLOPS

The best result I can get is 11.97GFLOPS with nx = 262144 and batch = 64

How does NVIDIA get to 52? or is my maths horribly wrong?

Cheers,

Chris

This is a version of the code with the timer from the cutil library (it should work under Linux and Windows). You will need to change the file suffix back to .cu.

Is that code not doing the exact same thing as cclark’s code? By that I mean the same FFT and recording the same results but displaying them differently?

That gives the number of FFT’s done each second and from that you can calculate the number of GFLOPS from the formula I provided earlier (I’ve been kind and used 6 rather than 5 operations like you should for BenchFFT but it should really be 5).

I’ve not got my 8 series GPU yet so I can’t test your code and I’ve only really had a flick through it. Does it give results of 52 GFLOPS for benchFFT?

For reference, the IBM Cell’s performance is 22 GFLOPS for 1-D non-real benchFFT, and a 3.0GHz Core Duo is 14.5 GFLOPS. More info on benchFFT (including the formula I used in my last post) is at BenchFFT results

Chris

Sorry, I posted the code for StrikerBlitz.
The code is exactly the same, just with a different timer that works under Linux.

The 52Gflops was for a batch of 4000 transforms of 1024 elements on an Ultra.

I might be making a mistake here but that sounds very high. What formula do you use to calculate GFLOPS?

Looking at the data returned for the Quadro FX5600 with pinned memory (which shouldn’t be a world away from the ULTRA)

you get results for nx = 1024 and batch = 1, 4, 16, 64 respectively of 16129, 51282, 115942, 147465. These results are getting closer together (as would be expected) with the larger batches. The ratio between batch 1 and batch 4 is 3.18, between 4 and 16 it is 2.26, and between 16 and 64 it is 1.27. Assuming that each further fourfold increase in batch size (256, 1024, 4096) continued to yield a factor of 1.27, optimistically I would be looking for something around 147465 × 1.27 × 1.27 × 1.27 ≈ 302065.

Plugging that into the benchFFT formula for GFLOPS gives 18.56GFLOPS.

Chris

The 52Gflops are on the device with no I/O, using the 5NlogN formula

Thanks, thats what I was wanting to know.

Cheers,

Chris

What am I missing here? Why is the FX5600 performing better than the 8800GTX in this comparison?

I would presume it’s the memory bandwidth of the FX5600 but I’m not entirely sure. It’s quite a large difference in performance.

Chris

I’ve got results for the GeForce 8800 Ultra. I used the unmodified code in the first post for the first test:

Board: Intel Workstation Board S5000XVN | Chipset: Intel 5000X Chipset

Device 0: "GeForce 8800 Ultra"

  Major revision number:                         1

  Minor revision number:                         0

  Total amount of global memory:                 804978688 bytes

  Clock rate:                                    1512000 kilohertz

--------+-----------------------------------

1D Complex-to-Complex In-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

-------+--------+--------+--------+--------

    1024|   14617|   40262|   73081|   84974

    2048|   10333|   24995|   36644|   38965

    4096|    7865|   15974|   18937|   20556

    8192|    5716|    8677|    9369|   10578

   16384|    3702|    4092|    4347|    4595

   32768|    2068|    2155|    2351|    2466

   65536|    1120|    1188|    1264|    1312

  131072|     567|     622|     651|     655

  262144|     279|     296|     307|     305

-------+--------+--------+--------+--------

--------+-----------------------------------

1D Complex-to-Complex Out-of-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

-------+--------+--------+--------+--------

    1024|   14775|   40259|   73140|   84774

    2048|   10489|   25265|   36650|   39190

    4096|    7828|   15936|   18952|   20600

    8192|    5709|    8692|    9419|   10706

   16384|    3709|    4107|    4352|    4594

   32768|    2085|    2163|    2381|    2480

   65536|    1124|    1201|    1292|    1302

  131072|     562|     616|     654|     667

  262144|     281|     297|     308|     311

-------+--------+--------+--------+--------

Press ENTER to exit...

And as for the “pinned” test:

(I’m not sure if I understood “pinned” correctly - what I did was replace the calloc for hostArrayA and hostArrayB with cudaMallocHost calls)

Device 0: "GeForce 8800 Ultra"

  Major revision number:                         1

  Minor revision number:                         0

  Total amount of global memory:                 804978688 bytes

  Clock rate:                                    1512000 kilohertz

--------+-----------------------------------

1D Complex-to-Complex In-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

-------+--------+--------+--------+--------

    1024|   15821|   50443|  114128|  147199

    2048|   11608|   33272|   57633|   65669

    4096|    9293|   23330|   30164|   33120

    8192|    7360|   13166|   15334|   16460

   16384|    5239|    6107|    6370|    6452

   32768|    3091|    3351|    3446|    3468

   65536|    1749|    1828|    1853|    1857

  131072|     906|     928|     935|     933

  262144|     419|     423|     423|     423

-------+--------+--------+--------+--------

--------+-----------------------------------

1D Complex-to-Complex Out-of-Place FFTs

--------+-----------------------------------

 nx     | batch

        +--------+--------+--------+--------

        |       1|       4|      16|      64

-------+--------+--------+--------+--------

    1024|   15938|   50572|  113974|  144755

    2048|   11615|   33726|   57516|   65738

    4096|    9304|   23278|   30111|   33170

    8192|    7368|   13156|   15368|   16517

   16384|    5246|    6052|    6379|    6452

   32768|    3086|    3359|    3454|    3479

   65536|    1750|    1849|    1878|    1882

  131072|     910|     931|     940|     940

  262144|     420|     423|     424|     424

-------+--------+--------+--------+--------

Press ENTER to exit...