Hello, I would like to use CUDA to run many (say 200,000) 1-D FFTs of size 256 simultaneously. However, there seems to be a limit on the batch count of 65,535: if the batch count is larger than this, the results are completely wrong. Does anyone know what’s going on?
Here is the configuration of my video card, followed by my code:
[codebox]
Device 0: “Tesla C1060”
CUDA Driver Version: 2.30
CUDA Runtime Version: 2.30
CUDA Capability Major revision number: 1
CUDA Capability Minor revision number: 3
Total amount of global memory: 4294705152 bytes
Number of multiprocessors: 30
Number of cores: 240
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 16384
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 1.30 GHz
Concurrent copy and execution: Yes
Run time limit on kernels: No
Integrated: No
Support host page-locked memory mapping: Yes
Compute mode: Default (multiple host threads can use this device simultaneously)
[/codebox]
[codebox]
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil_inline.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// Forward declaration; runTest() is defined after main().
void runTest(int argc, char** argv);
// Length, in real samples, of each 1-D transform (the FFT size given to
// cufftPlan1d). Host and device buffers store SIGNAL_SIZE+2 floats per signal.
#define SIGNAL_SIZE 256
// Number of signals transformed together (the batch count given to cufftPlan1d).
#define N_SIGNAL 65536
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // All of the work is done by runTest(); main only sequences it with the
    // cutil shutdown helper and reports success to the OS.
    runTest(argc, argv);

    cutilExit(argc, argv);

    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char** argv)
{
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// Allocate host memory for the signal
float* h_signal = (float *)malloc(sizeof(float)*(SIGNAL_SIZE+2)*N_SIGNAL);
// Initalize the memory for the signal
for (unsigned int i = 0; i < N_SIGNAL; i++) {
float fl = 0.1+0.3*i/float(N_SIGNAL);
for (unsigned int j = 0; j < SIGNAL_SIZE+2; j++) {
h_signal[i*(SIGNAL_SIZE+2)+j] = sin(2*3.1416*fl*j);
}
}
// Allocate device memory for signal
float* d_signal;
cutilSafeCall(cudaMalloc((void**)&d_signal, sizeof(float)*(SIGNAL_SIZE+2)*N_SIGNAL));
if ( cudaSuccess == cudaGetLastError() ) { printf("Success!\n"); }
else { printf("Error %d\n", cudaGetLastError()); }
// Copy host memory to device
cutilSafeCall(cudaMemcpy(d_signal, h_signal, sizeof(float)*(SIGNAL_SIZE+2)*N_SIGNAL, cudaMemcpyHostToDevice));
if ( cudaSuccess == cudaGetLastError() ) { printf("Success!\n"); }
else { printf("Error %d\n", cudaGetLastError()); }
// CUFFT plan
cufftHandle plan;
cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_R2C, N_SIGNAL);
if ( cudaSuccess == cudaGetLastError() ) { printf("Success!\n"); }
else { printf("Error %d\n", cudaGetLastError()); }
// FFT
cufftExecR2C(plan, (cufftReal *)d_signal, (cufftComplex *)d_signal);
printf("Error: %s\n", cudaGetErrorString(cudaGetLastError()));
// Copy device memory to host
cutilSafeCall(cudaMemcpy(h_signal, d_signal, sizeof(float)*(SIGNAL_SIZE+2)*N_SIGNAL, cudaMemcpyDeviceToHost));
if ( cudaSuccess == cudaGetLastError() ) { printf("Success!\n"); }
else { printf("Error %d\n", cudaGetLastError()); }
for (unsigned int i = 0; i < N_SIGNAL; i++) {
float fl = 0.1+0.3*i/float(N_SIGNAL);
for (unsigned int j = 0; j < (SIGNAL_SIZE+2)/2; j++) {
printf("i=%5d fl=%10.4f j=%5d f=%10.4f h[i]=%10.4f %10.4f\n", i, fl, j, j/float(SIGNAL_SIZE), h_signal[i*(SIGNAL_SIZE+2)+j*2], h_signal[i*(SIGNAL_SIZE+2)+j*2+1]);
}
printf("\n");
}
//Destroy CUFFT context
cufftSafeCall(cufftDestroy(plan));
// cleanup memory
free(h_signal);
cutilSafeCall(cudaFree(d_signal));
cudaThreadExit();
}
[/codebox]