Hello,
I do not understand why my kernel is not doing the appropriate computations. I allocated a pinned memory on the host and wrote a very stupid kernel that takes the pinned memory input and add 2.0 to each elements in the pinned memory variable. I do this using a serialization of kernels (this is in fact a test case for a more complicated case that I would like to solve) for a number of times (corresponding to NBLOCKS). It seems that every input in the pinned variable should be increased by NBLOCKS*2 (in the case here +6). The output of the function is wrong… Can someone help me please? I really need to make this working otherwise I will have to use a traditional parallel processing using CPU.
Here is the code:
/* --------------------------- header secton ----------------------------*/
#include<stdio.h>
#include<cuda.h>
#define NNEUR 10
#define NVarNEUR 4
#define NBLOCKS (int) 3
/* --------------------------- target code ------------------------------*/
__global__ void TEST(float *IN)
{
int idx=threadIdx.x;
int i;
i=0;
float yout[NVarNEUR];
for (i=0;i<NVarNEUR;i++) yout[i]=IN[idx*NVarNEUR+i];
for (i=0;i<NVarNEUR;i++) {
IN[idx*NVarNEUR+i]=yout[i]+2.0;
}
__syncthreads();
}
/* --------------------------- host code ------------------------------*/
int main (int argc, char *argv[])
{
int i,j;
float *IN,*IN_d;
cudaSetDevice(0);
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaHostAlloc((float **)&IN, sizeof(float)*NVarNEUR*NNEUR, cudaHostAllocMapped);
for (i=0;i<(NNEUR);i++) {
IN[i*NVarNEUR]=1.0;
IN[i*NVarNEUR+1]=2.0;
IN[i*NVarNEUR+2]=3.0;
IN[i*NVarNEUR+3]=4.0;
}
cudaHostGetDevicePointer(&IN_d, IN, 0);
// allocate and initialize an array of stream handles and events
cudaStream_t *streams = (cudaStream_t*) malloc((NBLOCKS) * sizeof(cudaStream_t));
for(i = 0; i < (NBLOCKS); i++) cudaStreamCreate(&(streams[i]));
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t*) malloc(NBLOCKS * sizeof(cudaEvent_t));
for(int i = 0; i < NBLOCKS; i++) cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming);
TEST<<<1,NNEUR,0,streams[0]>>>(IN_d);
cudaEventRecord(kernelEvent[0], streams[0]);
for (i=1;i<NBLOCKS;i++){
TEST<<<1,NNEUR,0,streams[i]>>>(IN_d);
cudaEventRecord(kernelEvent[i], streams[i]);
cudaStreamWaitEvent(streams[i], kernelEvent[i-1],0);
}
for (i=0;i<NNEUR;i++) {
for (j=0;j<NVarNEUR;j++) printf("IN: %f \t\t",IN[i*NVarNEUR+j]);
printf("\n");
}
cudaDeviceSynchronize();
for(int i = 0; i < NBLOCKS; i++) {
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
cudaFreeHost(IN);
}
And the results:
IN: 3.000000 IN: 6.000000 IN: 5.000000 IN: 6.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
IN: 7.000000 IN: 6.000000 IN: 7.000000 IN: 8.000000
Thank you for helping me…
Pi-r