Hi,
I’m a beginner with CUDA, so maybe I forgot something important to do …
However I think this problem is very strange because I didn’t find anything similar on google.
When I start compiling it fails and returns me something strange :
[codebox]
nvcc --compiler-bindir=/usr/bin -I/opt/cuda/include -L/opt/cuda/lib64 -o training ./training.cpp
./training.cpp: In function ‘void MatrixProductKernel(float*, float*, float*, int)’:
./training.cpp:38: error: ‘threadIdx’ was not declared in this scope
./training.cpp:40: error: ‘threadIdx’ was not declared in this scope
./training.cpp: In function ‘void MatrixProductOnGPU(float*, float*, float*, int)’:
./training.cpp:63: error: expected primary-expression before ‘<’ token
./training.cpp:63: error: expected primary-expression before ‘>’ token
make: *** [all] Error 255
[/codebox]
I really think there is a strange problem somewhere because threadIdx should be found there, but I got this output instead.
But maybe it’s due to something else : line 63.
However I keep reading my code and I don’t find anything unusual for a cuda code.
Makefile :
[codebox]
all :
nvcc --compiler-bindir=/usr/bin -I/opt/cuda/include -L/opt/cuda/lib64 -o training ./training.cpp
clean :
rm -f ./training
[/codebox]
My tiny and unfinished source code :
[codebox]
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
global void MatrixProductKernel(float *a_device, float *b_device, float *c_device, int width) {
int i;
float c_value = 0.0;
for(i=0 ; i < width ; i++)
c_value += a_device[threadIdx.y * width + i] + b_device[i * width + threadIdx.x];
c_device[threadIdx.y * width + threadIdx.x] = c_value;
}
void MatrixProductOnGPU(float *a_host, float *b_host, float *c_host, int width) {
size_t size = width * width * sizeof(float);
float *a_device, *b_device, *c_device;
cudaMalloc((void **)a_device, size);
cudaMemcpy(a_device, a_host, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)b_device, size);
cudaMemcpy(b_device, b_host, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)c_device, size);
dim3 dim_grid(1, 1);
dim3 dim_block(width, width);
MatrixProductKernel<<< dim_grid, dim_block, 0, 0 >>>(a_device, b_device, c_device, width);
cudaMemcpy(c_host, c_device, size, cudaMemcpyDeviceToHost);
cudaFree(a_device);
cudaFree(b_device);
cudaFree(c_device);
}
int main(int argc, char **argv) {
std::cout << “Test 1” << std::endl;
return 0;
}
[/codebox]
Could you please tell me what’s wrong with my code ?
Thanks,
Alef B.