Hello everyone,
I’m trying to separate my kernels into multiple .cu files, in order to fine-tune the nvcc compile options per kernel. Since they all depend on the same constant datastructures, I must call cudaMemcpyToSymbol for each file. However, this does not work as expected: when launching the kernels, all datastructures are empty. Only one is correctly copied, which is the last object-file passed to the linker. It seems that although the constant declarations cannot be referenced externally, they also cannot have equal names.
The following code can be used to reproduce the problem:
kernel1.cu:
[codebox]#include <stdio.h>
constant float value;
constant float value_kernel1;
extern “C” void set_values_kernel1(float value)
{
cudaMemcpyToSymbol("value", &value, sizeof(float), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol("value_kernel1", &value, sizeof(float), 0, cudaMemcpyHostToDevice);
}
global void kernel1(float *data)
{
if (threadIdx.x == 0 && threadIdx.y == 0)
{
data[0] = value;
data[1] = value_kernel1;
}
}
extern “C” void launch_kernel1(void)
{
float value[2];
float *devPtr;
cudaMalloc((void **)&devPtr, sizeof(value));
dim3 grid(1);
dim3 block(32);
kernel1<<<grid, block>>>(devPtr);
cudaThreadSynchronize();
cudaMemcpy(&value, devPtr, sizeof(value), cudaMemcpyDeviceToHost);
printf("kernel1, __constant__ value: %f\n", value[0]);
printf("kernel1, __constant__ value_kernel1: %f\n", value[1]);
cudaFree(devPtr);
}
[/codebox]
kernel2.cu:
[codebox]#include <stdio.h>
constant float value;
constant float value_kernel2;
extern “C” void set_values_kernel2(float value)
{
cudaMemcpyToSymbol("value", &value, sizeof(float), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol("value_kernel2", &value, sizeof(float), 0, cudaMemcpyHostToDevice);
}
global void kernel2(float *data)
{
if (threadIdx.x == 0 && threadIdx.y == 0)
{
data[0] = value;
data[1] = value_kernel2;
}
}
extern “C” void launch_kernel2(void)
{
float value[2];
float *devPtr;
cudaMalloc((void **)&devPtr, sizeof(value));
dim3 grid(1);
dim3 block(32);
kernel2<<<grid, block>>>(devPtr);
cudaThreadSynchronize();
cudaMemcpy(&value, devPtr, sizeof(value), cudaMemcpyDeviceToHost);
printf("kernel2, __constant__ value: %f\n", value[0]);
printf("kernel2, __constant__ value_kernel2: %f\n", value[1]);
cudaFree(devPtr);
}
[/codebox]
main.cpp:
[codebox]extern “C” void set_values_kernel1(float value);
extern “C” void set_values_kernel2(float value);
extern “C” void launch_kernel1(void);
extern “C” void launch_kernel2(void);
int main(int argc, char *argv)
{
set_values_kernel1(0.5f);
set_values_kernel2(0.7f);
launch_kernel1();
launch_kernel2();
}
[/codebox]
To build the test-program, issue:
nvcc -c kernel1.cu
nvcc -c kernel2.cu
nvcc main.cpp kernel1.o kernel2.o
Running the program results in an incorrect output (these values should be equal kernel1: 0.5, kernel2: 0.7):
kernel1, constant value: 0.000000
kernel1, constant value_kernel1: 0.500000
kernel2, constant value: 0.700000
kernel2, constant value_kernel2: 0.700000
When linking kernel2 before kernel1, the output becomes:
nvcc main.cpp kernel2.o kernel1.o
kernel1, constant value: 0.700000
kernel1, constant value_kernel1: 0.500000
kernel2, constant value: 0.000000
kernel2, constant value_kernel2: 0.700000
It seems, that the constant value of kernel1 is overwritten by the third cudaMemcpyToSymbol call (using the value 0.7).
The platform I’m using is Ubuntu 9.04 32bit, CUDA 2.3, GeForce 8800M GTX.
Is this normal behaviour, or should it be considered a bug?