Hi Everyone,
I spent a lot of time trying to fix a bug in the following vector-addition application (the code sample below performs the computation on both the GPU and the CPU). All of the output from the __global__ kernel function is 0! I am using CUDA Toolkit 3.2 and driver 260.99. The graphics card is an NVIDIA GTX480. OS: Win7, 64-bit.
I would greatly appreciate it if anyone could give me a hint about where the bug is.
Thank you!
Jason
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <windows.h>
//#define VAR 100000000
#define VAR 8
// Element-wise vector addition on the device: C[i] = A[i] + B[i] for i < N.
// Expects a 1-D launch with at least N total threads; the bounds check
// guards the tail block when N is not a multiple of blockDim.x.
// BUG FIX: the qualifier must be __global__ (double underscores). As posted,
// `global` is not a valid qualifier, so this kernel cannot compile — which is
// why the device output was never produced.
__global__ void vecAdd_d(float *A, float *B, float *C, int N) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < N)
        C[tid] = A[tid] + B[tid];
}
// CPU reference implementation of element-wise vector addition:
// C[i] = A[i] + B[i] for all i in [0, N). Used to validate the GPU result.
void vecAdd_h(float *A, float *B, float *C, int N) {
    int idx;
    for (idx = 0; idx < N; ++idx) {
        C[idx] = A[idx] + B[idx];
        // printf("%f\n", C[idx]);  // debug output (disabled)
    }
}
// Drives the GPU and CPU vector additions over VAR elements, prints both
// result vectors, and reports wall-clock timings via QueryPerformanceCounter.
// Fixes applied to the original:
//  - kernel launch errors are now surfaced with cudaGetLastError();
//  - the device timer synchronizes before stopping (launches are async);
//  - malloc'd host buffers are released with free(), not `delete` (UB);
//  - allocation failures are checked instead of silently ignored.
__declspec(dllexport) int main() {
    float *A_h, *B_h, *C_h;   /* host buffers   */
    float *A_d, *B_d, *C_d;   /* device buffers */

    printf("**************GPU Processing****************\n\n");
    printf("Device Initialization ...\n\n");

    A_h = (float *) malloc(sizeof(float) * VAR);
    B_h = (float *) malloc(sizeof(float) * VAR);
    C_h = (float *) malloc(sizeof(float) * VAR);
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        fprintf(stderr, "Host memory allocation failed!\n");
        return 1;
    }

    cudaError_t err = cudaMalloc((void**)&A_d, sizeof(float) * VAR);
    if (err == cudaSuccess) err = cudaMalloc((void**)&B_d, sizeof(float) * VAR);
    if (err == cudaSuccess) err = cudaMalloc((void**)&C_d, sizeof(float) * VAR);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("Memory Allocation is Done!\n");

    for (int i = 0; i < VAR; i++) {
        A_h[i] = 2.0f;  /* float literals: avoid silent double promotion */
        B_h[i] = 2.0f;
    }

    cudaMemcpy(A_d, A_h, sizeof(float) * VAR, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, sizeof(float) * VAR, cudaMemcpyHostToDevice);
    printf("Data copied from Host to Device!\n");

    int threadsPerBlock = 4;
    /* ceil-division so the last partial block is still launched */
    int blocksPerGrid = (VAR + threadsPerBlock - 1) / threadsPerBlock;
    printf("Thread configuration is Done!\n\n");

    // Kernel invocation
    printf("Invoking Kernel functions...\n");
    LARGE_INTEGER curFreq_d, curStart_d, curEnd_d;
    QueryPerformanceFrequency(&curFreq_d);
    QueryPerformanceCounter(&curStart_d);
    vecAdd_d<<<blocksPerGrid, threadsPerBlock>>>(A_d, B_d, C_d, VAR);

    /* A kernel launch is asynchronous: without a synchronize here the timer
       stops before the kernel has actually run, and a launch failure (the
       likely cause of the all-zero output) would go completely unnoticed. */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
    cudaThreadSynchronize();  /* CUDA 3.2 API; cudaDeviceSynchronize() in 4.0+ */
    QueryPerformanceCounter(&curEnd_d);

    cudaMemcpy(C_h, C_d, sizeof(float) * VAR, cudaMemcpyDeviceToHost);
    printf("Data copied from Device to Host!\n");
    printf("Device adding result:\n");
    for (int i = 0; i < VAR; i++)
        printf("line: %f\n", C_h[i]);

    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
    printf("Device memory space is Freed!\n\n");

    double time_d = (double)(curEnd_d.QuadPart - curStart_d.QuadPart) / curFreq_d.QuadPart;
    printf("Device Executing Time: %f(ms)\n", time_d * 1000);
    printf("**************GPU Processing is Done****************\n\n");

    printf("**************CPU Processing****************\n\n");
    LARGE_INTEGER curFreq_h, curStart_h, curEnd_h;
    QueryPerformanceFrequency(&curFreq_h);
    QueryPerformanceCounter(&curStart_h);
    vecAdd_h(A_h, B_h, C_h, VAR);
    QueryPerformanceCounter(&curEnd_h);
    printf("Host adding result:\n");
    for (int i = 0; i < VAR; i++)
        printf("line: %f\n", C_h[i]);
    double time_h = (double)(curEnd_h.QuadPart - curStart_h.QuadPart) / curFreq_h.QuadPart;
    printf("Host Executing Time = %f(ms)\n", time_h * 1000);
    printf("**************CPU Processing is Done****************\n\n");

    printf("Vector Size = %d\n", VAR);
    if (time_d > 0.0)  /* guard against division by zero on a too-fast timer */
        printf("Speedup = %f\n", time_h / time_d);

    /* Memory from malloc() must be released with free(), never `delete`
       (the original used delete on malloc'd pointers — undefined behavior). */
    free(A_h);
    free(B_h);
    free(C_h);
    return 0;
}