float *cpuA;
float *cpuB;
float *cpuC;
float *gpuA;
float *gpuB;
float *gpuC;
float *meanVectorGPU;
float *meanVectorCPU;
int i, j;
int nRows = 3;
int nColumns = 3;
// launch configurations (currently unused; the launches below pass raw sizes instead)
dim3 threads2(nColumns);
dim3 grid2(nColumns);
dim3 threads(nRows,nColumns);
dim3 grid(nRows,nColumns);
unsigned int timer = 0;
unsigned int elapsed = 0;
//cublasStatus status;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
CUT_DEVICE_INIT();
cpuA = (float*) malloc (nRows*nColumns*sizeof(float));
cpuB = (float*) malloc (nColumns*nColumns*sizeof(float));
cpuC = (float*) malloc (sizeof(float)); // a single float, unlike gpuC below
cudaMalloc((void **)&gpuA, nRows*nColumns*sizeof(float));
cudaMalloc((void **)&gpuB, nColumns*nColumns*sizeof(float));
cudaMalloc((void **)&gpuC, nColumns*nColumns*sizeof(float));
cudaMalloc((void **)&meanVectorGPU, nColumns*sizeof(float));
initMatrix(cpuA,cpuB,cpuC,nRows,nColumns);
for(i = 0; i < nRows; i++){
    for(j = 0; j < nColumns; j++){
        printf("%f ", cpuA[j+i*nColumns]);
    }
    printf("\n");
}
printf("\n");
cudaFree(gpuC);
free(cpuC);
cudaMemcpy(gpuA, cpuA, nRows*nColumns*sizeof(float), cudaMemcpyHostToDevice);
meanVectorCPU = (float*) malloc (nColumns*sizeof(float));
memset(meanVectorCPU, 0, nColumns*sizeof(float));
extractMeanVector<<<nColumns, nColumns>>>(meanVectorGPU, gpuA, nRows, nColumns); // nColumns blocks of nColumns threads each
CUT_CHECK_ERROR("Kernel execution failed");
cudaThreadSynchronize();
cudaMemcpy(meanVectorCPU, meanVectorGPU, nColumns*sizeof(float), cudaMemcpyDeviceToHost);
printf("MeanVector \n \n");
for(i = 0; i < nColumns; i++){
    printf(" %f ", meanVectorCPU[i]);
}
printf("\n \nNormalized Matrix \n \n");
normalizeMatrix<<<1, 9>>>(gpuA, meanVectorGPU, nRows, nColumns); // one block of 9 threads (nRows*nColumns hardcoded)
cudaThreadSynchronize();
CUT_CHECK_ERROR("Kernel execution failed");
cudaMemcpy(cpuA, gpuA, nColumns*sizeof(float), cudaMemcpyDeviceToHost); // copies only nColumns floats (one row) back to the host
for(i = 0; i < nRows; i++){
    for(j = 0; j < nColumns; j++){
        printf("%f ", cpuA[j+i*nColumns]);
    }
    printf("\n");
}
That is the host code up to the problem point, where my kernel is called.
__global__ void normalizeMatrix(float* gpuA, float* meanVector, int nRows, int nColumns){
    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
    const int iy = blockDim.y * blockIdx.y + threadIdx.y;
    float temp = 0;   // leftover locals from the full kernel, unused in this stripped-down version
    float temp2 = 0;
    float final, final2;
    gpuA[ix] = 2;     // debug write: each thread stamps one element
}
That is the modified normalizeMatrix, and it still causes only the first row to be set to 0; all other rows stay the same as they were before. No errors appear up to this point, checking with CUT_CHECK_ERROR after each kernel execution. Currently I am using two kernels, but in the future I hope to switch to a single one to minimize communication.
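For reference, here is the kind of fused kernel I have in mind. It is just a sketch under a couple of assumptions: the whole (small) matrix is handled by a single block, since __syncthreads() only synchronizes within one block, and the normalization is taken to be subtracting each column's mean, which is what I believe the two kernels do together. The name extractAndNormalize is a placeholder:

__global__ void extractAndNormalize(float* gpuA, float* meanVector, int nRows, int nColumns){
    const int col = threadIdx.x; // one thread per column

    if(col < nColumns){
        // pass 1: compute this column's mean
        float sum = 0.0f;
        for(int row = 0; row < nRows; row++){
            sum += gpuA[row*nColumns + col];
        }
        meanVector[col] = sum / (float)nRows;
    }

    __syncthreads(); // every mean must be written before any thread reads one

    if(col < nColumns){
        // pass 2: subtract the column mean from every element of the column
        for(int row = 0; row < nRows; row++){
            gpuA[row*nColumns + col] -= meanVector[col];
        }
    }
}

It would be launched with one block of nColumns threads, and the copy back would have to request the whole matrix rather than a single row:

extractAndNormalize<<<1, nColumns>>>(gpuA, meanVectorGPU, nRows, nColumns);
cudaThreadSynchronize();
cudaMemcpy(cpuA, gpuA, nRows*nColumns*sizeof(float), cudaMemcpyDeviceToHost);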