Hi,
I’m having some issues launching kernels without using the NVCC specific <<< >>> chevron notation. Here’s a simple example:
#include <stdio.h>
#include <cuda_runtime.h>
#define NVCC_LAUNCH
__global__ void testKernel(int* output, int* input, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
output[i] = input[i];
}
int main(int argc, char** argv)
{
cudaSetDevice(0);
int N = 256;
size_t size = N * sizeof(int);
int* h_input = new int[N];
int* h_output = new int[N];
int* d_input;
int* d_output;
cudaMalloc((void**)&d_input, size);
cudaMalloc((void**)&d_output, size);
for (int i = 0; i < N; i++) h_input[i] = 10;
cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);
cudaMemset(d_output, 0, size);
int blocks = 1;
int threads = 256;
dim3 gridDim(blocks, 1, 1);
dim3 blockDim(threads, 1, 1);
#ifdef NVCC_LAUNCH
testKernel<<<gridDim, blockDim>>>(d_output, d_input, N);
#else
cudaConfigureCall(gridDim, blockDim, 0);
size_t offset = 0;
cudaSetupArgument(d_output, offset);
offset += sizeof(d_output);
cudaSetupArgument(d_input, offset);
offset += sizeof(d_input);
cudaSetupArgument(N, offset);
cudaLaunch("testKernel");
#endif
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("ERROR: %s\n", cudaGetErrorString(err));
cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);
printf("output[0] = %d\n", h_output[0]);
printf("output[127] = %d\n", h_output[127]);
printf("output[255] = %d\n", h_output[255]);
cudaFree(d_input);
cudaFree(d_output);
delete[] h_input;
delete[] h_output;
return 0;
}
I compile this example like this (x86_64 linux):
/apps/Linux64/cuda/cuda-3.2/bin/nvcc -m64 -arch sm_21 -o main main.cu
Which works perfectly with the NVCC_LAUNCH option enabled:
output[0] = 10
output[127] = 10
output[255] = 10
but when I disable this, I get the “invalid device function” warning:
ERROR: invalid device function
output[0] = 0
output[127] = 0
output[255] = 0
Can anyone spot any errors in this example or point to why this might not be working?
I’m using a Fermi GPU (tried both Tesla C2050 and Quadro 4000) with CUDA 3.2 and sm_21.
Thanks,
Dan