Hello, I want to implement CDP for a basic forward function (I will call the forward function to many times at the same time (also from a cuda function) and because of that I want to use CDP)
Here’s the code that I’m trying to run;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
cudaDeviceSynchronize();
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
I also tried to run CDP from a device function like this but still gave me the same error;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}
And also tried cudaLaunchKernel
function and using global instead of device but they didn’t work either. I’m using -rdc=true flag too and also my arch is sm_75 which should support CDP