I’m struggling to get the following CUDA code up to speed. The code works (i.e. it returns the correct output) in both emulated and GPU mode, but the GPU binary is about four times slower than the emulated binary (20 seconds versus 5 seconds). All memory allocations on the GPU succeed in less than one second, so the added execution time of the GPU binary is in the kernel call itself. I don’t expect the GPU binary to be faster just because it runs on the GPU, since my code is probably not very well optimized, but I would expect its execution time to be at least on par with the emulated binary. Is that a reasonable assumption? Is my kernel unfriendly in terms of GPU memory access? Am I missing something here?
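To back up the claim that the extra time is in the kernel call itself: the kernel can be timed in isolation with CUDA events, roughly as in the sketch below (illustrative only, not the exact code from my timing runs; it assumes the device pointers and launch configuration from main() further down).

// Rough sketch: timing only the kernel launch with CUDA events.
// Assumes F_d, A_d, B_d, T_d, M, N, gridsize and blocksize are set up as in main() below.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
kernel<<<gridsize, blocksize>>>(F_d, A_d, B_d, T_d, M, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);   // block until the kernel has really finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "kernel time: " << ms << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);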
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <cutil.h>   // from the CUDA SDK, provides CUT_CHECK_ERROR

// Each thread handles one chunk of M consecutive elements of F.
__global__ void kernel(float* F, float* A, float* B, float* T, int M, int N) {
    int v = threadIdx.x*M + blockIdx.x*blockDim.x*M;  // start of this thread's M-element chunk
    if (v < M*N) {
        // T[v..v+M-1] = B (M x M) * F[v..v+M-1]
        for (int i = 0; i < M; i++) {
            T[v+i] = 0;
            for (int j = 0; j < M; j++) {
                T[v+i] += B[i*M+j]*F[v+j];
            }
        }
        // element-wise rescaling of F (only the first M entries of A are used)
        for (int i = 0; i < M; i++) {
            F[v+i] = F[v+i]*A[i]/T[v+i];
        }
    }
}
int main() {
    int M = 752;  // data per thread
    int N = 3584; // number of threads

    float* A = (float*) malloc(M*N*sizeof(float)); for (int i = 0; i < M*N; i++) { A[i] = 1; }
    float* B = (float*) malloc(M*M*sizeof(float)); for (int i = 0; i < M*M; i++) { B[i] = 1; }
    float* F = (float*) malloc(M*N*sizeof(float)); for (int i = 0; i < M*N; i++) { F[i] = 1; }

    int blocksize = 512;
    int gridsize = (int)ceil((float)N/(float)blocksize); // 7 in this example
    std::cout << "block size: " << blocksize << " threads" << std::endl;
    std::cout << "grid size: " << gridsize << " blocks" << std::endl;

    float* A_d; cudaMalloc((void**) &A_d, M*N*sizeof(float));
    float* B_d; cudaMalloc((void**) &B_d, M*M*sizeof(float));
    float* F_d; cudaMalloc((void**) &F_d, M*N*sizeof(float));
    float* T_d; cudaMalloc((void**) &T_d, M*N*sizeof(float));

    cudaMemcpy(A_d, A, M*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B, M*M*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(F_d, F, M*N*sizeof(float), cudaMemcpyHostToDevice);

    kernel<<<gridsize, blocksize>>>(F_d, A_d, B_d, T_d, M, N);
    CUT_CHECK_ERROR("kernel error: ");

    cudaMemcpy(F, F_d, M*N*sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(F_d);
    cudaFree(T_d);
    free(A);
    free(B);
    free(F);

    return 0;
}
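As for the statement that the allocations succeed: what I mean is checking every cudaMalloc/cudaMemcpy return code, roughly as in the sketch below (the checkCuda helper is just a made-up name for illustration, not part of the CUDA API; it is not the exact code in my build).

// Rough sketch of the kind of return-code check meant by "allocations succeed".
// checkCuda is a hypothetical helper, shown only for illustration.
#include <cstdio>
#include <cstdlib>

static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// usage:
// checkCuda(cudaMalloc((void**) &A_d, M*N*sizeof(float)), "cudaMalloc A_d");
// checkCuda(cudaMemcpy(A_d, A, M*N*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy A");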
Some extra info: I’m using an NVIDIA GeForce 8600 GTS with CUDA 2.1 on Windows XP x64. The compiler is invoked as
nvcc -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -I "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include" -L "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\lib\amd64" -o main_gpu.exe main.cu
and
nvcc -deviceemu -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -I "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include" -L "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\lib\amd64" -o main_gpu.exe main.cu
for device emulation.