Hi all,
I have a problem with a program that uses cuFFT.
[codebox]#include <iostream> // original
#include "main.h"
using namespace std;
int main(int argc, char *argv)
{
int NX, NY, NZ; // define the fft limits
// fft limits
NX= 64;
NY= 64;
NZ= 128;
int NXYZ; // define the fft limits
NXYZ = NX* NY* NZ; // defining more spaces than required is possible
cout << " NXYZ " << NXYZ << endl;
cufftComplex data[NXYZ], data_1[NXYZ]; // data variables on the cpu
// GPU memory allocation 3d
cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*(NX*NY*NZ));
cudaMalloc((void**)&devPtr_1, sizeof(cufftComplex)*NX*NY*NZ);
/* creates 3D FFT plan */
cufftPlan3d(&plan, NX, NY, NZ, CUFFT_C2C);
cufftPlan3d(&plan_1, NX, NY, NZ, CUFFT_C2C);
// source data creation or fill the vector
for(i= 0 ; i < NX*NY*NZ ; i++){
// data part 1
data[i].x = 2.0f; // real part
data[i].y = 1.0f; // imag part
// data part 2
data_1[i].x = 4.0f;
data_1[i].y = 5.0f;
}
// transfer to GPU memory _
cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyHostToDevice);
cudaMemcpy(devPtr_1, data_1, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyHostToDevice);
/* executes FFT processes */
cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);
cufftExecC2C(plan_1, devPtr_1, devPtr_1, CUFFT_FORWARD);
/* executes FFT processes (inverse transformation) */
cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);
cufftExecC2C(plan_1, devPtr_1, devPtr_1, CUFFT_INVERSE);
/* transfer results from GPU memory */
cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyDeviceToHost);
cudaMemcpy(data_1, devPtr_1, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyDeviceToHost);
/* deletes CUFFT plan */
cufftDestroy(plan);
cufftDestroy(plan_1);
/* frees GPU memory */
cudaFree(devPtr);
cudaFree(devPtr_1);
for(i = 0 ; i < NXNYNZ ; i += 10000){
cout << "data " << i << " "<< data[i].x << " "<< data[i].y << " "
<< "data_1 " << i << " "<< data_1[i].x << " "<< data_1[i].y
<< endl;
}
return 0;
}[/codebox]
The program runs fine on a system with the following specs:
Centos 5.3; GeForce 9800 GT; code::Blocks 8.02; gcc 4.1.2; cuda 2.3
But when I run it on another machine with these specs:
Ubuntu 9.04; tesla c1060; cuda 2.3
the program crashes with a segmentation fault: :blink:
Segmentation fault
The code I am using on this second machine is:
[codebox]#include <iostream> // original
#include "main.h"
using namespace std;
int main(int argc, char *argv)
{
cudaSetDevice(0);
int NX, NY, NZ; // define the fft limits
// fft limits
NX= 64;
NY= 64;
NZ= 128;
int NXYZ; // define the fft limits
NXYZ = NX* NY* NZ; // defining more spaces than required is possible
cout << " NXYZ " << NXYZ << endl;
cufftComplex data[NXYZ], data_1[NXYZ]; // data variables on the cpu
// GPU memory allocation 3d
cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*(NX*NY*NZ));
cudaMalloc((void**)&devPtr_1, sizeof(cufftComplex)*NX*NY*NZ);
/* creates 3D FFT plan */
cufftPlan3d(&plan, NX, NY, NZ, CUFFT_C2C);
cufftPlan3d(&plan_1, NX, NY, NZ, CUFFT_C2C);
// source data creation or fill the vector
for(i= 0 ; i < NX*NY*NZ ; i++){
// data part 1
data[i].x = 2.0f; // real part
data[i].y = 1.0f; // imag part
// data part 2
data_1[i].x = 4.0f;
data_1[i].y = 5.0f;
}
// transfer to GPU memory _
cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyHostToDevice);
cudaMemcpy(devPtr_1, data_1, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyHostToDevice);
/* executes FFT processes */
cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);
cufftExecC2C(plan_1, devPtr_1, devPtr_1, CUFFT_FORWARD);
/* executes FFT processes (inverse transformation) */
cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);
cufftExecC2C(plan_1, devPtr_1, devPtr_1, CUFFT_INVERSE);
/* transfer results from GPU memory */
cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyDeviceToHost);
cudaMemcpy(data_1, devPtr_1, sizeof(cufftComplex)*NX*NY*NZ, cudaMemcpyDeviceToHost);
/* deletes CUFFT plan */
cufftDestroy(plan);
cufftDestroy(plan_1);
/* frees GPU memory */
cudaFree(devPtr);
cudaFree(devPtr_1);
for(i = 0 ; i < NXNYNZ ; i += 10000){
cout << "data " << i << " "<< data[i].x << " "<< data[i].y << " "
<< "data_1 " << i << " "<< data_1[i].x << " "<< data_1[i].y
<< endl;
}
return 0;
}
[/codebox]
I added cudaSetDevice(0); to ensure that I am using the first device (C1060), not the integrated one (nForce 980a).
As far as I know, the C1060 has 8 times the memory of the 9800 GT.
Any idea what is going on?
These are the contents of main.h:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
// cuFFT plan handles, one per data set; main() creates them with
// cufftPlan3d and releases them with cufftDestroy.
// NOTE(review): these are definitions (no `extern`) in a header, so including
// main.h from more than one source file would cause duplicate-symbol errors.
cufftHandle plan, plan_1;
// Device buffers for the two data sets; main() allocates them with
// cudaMalloc, runs the FFTs in place on them, and frees them with cudaFree.
cufftComplex *devPtr, *devPtr_1; // pointers for data on the gpu