Hello everyone,
I am new to CUDA. Now I am having a problem about assigning data to large size array in parallel.
The hardware I am using is Tesla c2070. I made a simple example to show the problem that I have.
This is the code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <shrQATest.h>
#include <string.h>
#include <fstream>
#include <iostream>
__global__ void func1(float* A, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
{
if (i % 2 == 0)
A[i] = 3.5;
else
A[i] = 7.5;
}
}
int main(int argc, char** argv)
{
float *A_CPU, *A_GPU;
int len = 2500000;
A_CPU = new float[len*2];
memset(A_CPU, 0, len*8);
cudaMalloc((void**)&A_GPU, sizeof(cufftComplex)*len);
int threadsPerBlock = 256;
int blocksPerGrid = (len*2 + threadsPerBlock - 1) / threadsPerBlock;
func1<<<blocksPerGrid, threadsPerBlock>>>(A_GPU, len*2);
cudaMemcpy(A_CPU, A_GPU, len*8, cudaMemcpyDeviceToHost);
std::string resultFile = "/home/qdi_admin/Downloads/parallelResult";
std::ofstream f_result;
f_result.open((char*)resultFile.c_str(), std::ios::out | std::ios::binary);
if (!f_result.write ((char*)A_CPU, len*8))
{
std::cout << "Write-to File Error !!!!" << std::endl;
}
f_result.close();
}
This code is just to assign the complex value 3.5+7.5j to an vector which is of size 25000000.
There is no error or warning when I compile and run the code. Then I verify the output data in Matlab. The value of each element should be 3.5+7.5j. But many values are wrong. It may be some strange numbers like -1.9984e+18 - 1.9984e+18j
If I reduce the length of data from 25000000 to 2500000, the result will be all right.
Can anyone give me any advice?
Thanks a lot.