I guess what I am trying to do is to take some 2-d arrays and do some adding and inserting of data at specific array elements
Currently I have this kernel:
// kernel that adds noise and initialize other arrays
__global__ void init_arrays(double coded_bits_d,
double received_bits_d,
double L_r_d,
double L_v_d,
double *N_0_d,
double *SNR_d)
{
int i = threadIdx.x;
int j = threadIdx.y;
//just add noise per SNR
// this loop gets all of the symbol bits into received bits
double noise, E;
for(int i=tidi;i<SIZEY;i++) {
for(int j=tidj;j<SIZEX;j++) {
noise = GaussianRV(N_0_d);
E = ((*SNR_d)*(*N_0_d)*4)/8;
received_bits_d[i][j] = sqrt(E)*coded_bits_d[i][j] + noise;
L_r_d[i][j] = 4*received_bits_d[i][j]*(*SNR_d);
L_v_d[i][j] = 0;
}
}
}
Here is the GaussianRV kernel for reference
__device__ double GaussianRV(double *N_0)
{
double v1,v2;
double w;
double y1;
double y2;
int use_last = 0;
if (use_last == 0)
{
do
{
v1 = 2.0*uniformRV() - 1.0;
v2 = 2.0*uniformRV() - 1.0;
w = v1*v1 + v2*v2;
}while (w >= 1.0);
w = sqrt((-2.0*log(w))/w);
y1 = v1*w;
y2 = v2*w;
use_last = 1;
}
else
{
y1 = y2;
use_last = 0;
}
return (y1*sqrt(*N_0/2));
}
But the code just seg faults.
This is the way I am declaring those arrays
//create 2-d arrays on CUDA
double *received_bits_d, *L_v_d, *L_h_d, *L_r_d, *decoded_bits_d;
//received_bits_d
cudaMalloc((void**)&received_bits_d, sizeof(double) * SIZEY * SIZEX);
//L_v_d
cudaMalloc((void**)&L_v_d, sizeof(double) * SIZEY * SIZEX);
//L_h_d
cudaMalloc((void**)&L_h_d, sizeof(double) * SIZEY * SIZEX);
//L_r_d
cudaMalloc((void**)&L_r_d, sizeof(double) * SIZEY * SIZEX);
//decoded_bits_d
cudaMalloc((void**)&decoded_bits_d, sizeof(double) * SIZEY * SIZEX);
and this is how I copied the original bits into the coded_bits_d array
//create the matrix populated with a sample coded bits //////////
// now using dynamically allocated arrays with pointers
double **coded_bits_h;
coded_bits_h = new double*;
for(i = 0; i < SIZEY; i++) {
coded_bits_h[i] = new double ;
}
...some data insertion into array...
//create a 2-d array on CUDA
double *coded_bits_d;
cudaMalloc((void**)&coded_bits_d, sizeof(double) * SIZEY * SIZEX);
//copy the contents of the coded bits from host to device
cudaMemcpy(coded_bits_d, coded_bits_h, sizeof(double) * SIZEY * SIZEX, cudaMemcpyHostToDevice);
and here is how I am calling the kernel:
//call CUDA to generate the packet array
init_arrays<<<BLOCK_NUM, THREAD_NUM>>>((double(*) )coded_bits_d,
(double(*) )received_bits_d,
(double(*) )L_r_d,
(double(*) )L_v_d,
N_0_d, SNR_d);
//error check for kernel
if (cudaGetLastError() != cudaSuccess) {
cout << "kernel launch failed\n";
}
cudaThreadSynchronize();
if (cudaGetLastError() != cudaSuccess) {
cout << "kernel execution failed\n";
}
While I am not trying to get my stuff just coded for me, I am really lost as to how to make use of the GPU. There is no error message so I am assuming the kernel launches and runs ok, but the program runs a bit and then seg faults. I am thinking I am not really using parallel computing techniques and making the kernel do something that is ordinarily done on the CPU. Not sure. But I am thinking it must be something to do with how i am declaring, allocating, and using the 2-d arrays. Most of the reference material online are 1-d arrays, should I be considering re-doing my code to use only 1-d arrays? i rather keep it 2-d but theres not that much documentation on using 2-d with CUDA.
I am especially confused on how to index the 2-d arrays with blockIdx.x, blockDim.x,threadIdx.x
Please Help!