# It is not clear how to implement the index

I’m a beginner and I find it hard to write CUDA code.
Please help rewrite the following code for Cuda kernel. I have three vectors: derWeight, error, input.

for (int j = 0, index = 0; j < error.size(); j++) {
for (int k = 0; k < input.size(); k++, index++) {
derWeight[index] += error[j] * input[k];
}
}

That set of nested loops is taking each element of `derWeight` and adding exactly one product to it. It is the product of one element of `error` multiplied by one element of `input`. So the challenge is to spot the indexing pattern, then come up with a method by which the sub-indexes `j` and `k` can be computed from `index`.

In a nutshell, the overall length of the affected portion of `derWeight` must be equal to the product of `input.size()` and `error.size()`. If you’re not sure of this, you already have the code, concoct a simple test case and see how far the loops iterate.

Therefore if `input.size()` is 2 and `error.size()` is 3, then `derWeight.size()` must be at least 6. In that case we have the following pattern by inspection:

``````       index          j          k
derWeight[0] += error[0] * input[0]
derWeight[1] += error[0] * input[1]
derWeight[2] += error[1] * input[0]
derWeight[3] += error[1] * input[1]
derWeight[4] += error[2] * input[0]
derWeight[5] += error[2] * input[1]
``````

From the above we can observe that:

``````j = index/input.size()
k = index%input.size()
``````

Which gives us the following kernel code:

``````template <typename T1, typename T2, typename T3>
// One thread per element of derWeight: thread idx computes
//   derWeight[idx] += error[idx / input_length] * input[idx % input_length]
// which reproduces the j/k mapping derived above.
// Launch with at least input_length*error_length total threads; the bounds
// guard makes any extra tail threads a no-op.
__global__ void k(T1 *derWeight, T2 *error, T3 *input, size_t input_length, size_t error_length){
// Flat global thread index across the 1-D grid.
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < input_length*error_length)
derWeight[idx] += error[idx/input_length]*input[idx%input_length];
}
``````

launch like so:

``````k<<<(derWeight.size()+511)/512, 512>>>(d_derWeight, d_error, d_input, input.size(), error.size());
``````

Here is a complete example:

``````# cat t42.cu
#include <vector>
#include <iostream>
#include <algorithm>
#include <numeric>

template <typename T1, typename T2, typename T3>
// One thread per derWeight element; index mapping: j = idx/input_length, k = idx%input_length.
__global__ void k(T1 *derWeight, T2 *error, T3 *input, size_t input_length, size_t error_length){
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
// Guard: the grid may contain more threads than there are elements.
if (idx < input_length*error_length)
derWeight[idx] += error[idx/input_length]*input[idx%input_length];
}

int main(){
// Test sizes: error length 3, input length 4 -> derWeight length 12.
const int el = 3;
const int il = 4;
std::vector<int> derWeight(el*il);
std::vector<int> error(el);
std::vector<int> input(il);
// derWeight starts as all 1s; error = {0,1,2}, input = {0,1,2,3}.
std::fill(derWeight.begin(), derWeight.end(), 1);
std::iota(error.begin(), error.end(),0);
std::iota(input.begin(), input.end(),0);
// Host reference: the original nested-loop version from the question.
for (int j = 0, index = 0; j < error.size(); j++) {
for (int k = 0; k < input.size(); k++, index++) {
derWeight[index] += error[j] * input[k];
}
}
for (int j = 0; j < derWeight.size(); j++) std::cout << derWeight[j] << " ";
std::cout << std::endl;

// Reset derWeight and repeat the same computation on the GPU.
std::fill(derWeight.begin(), derWeight.end(), 1);
// NOTE(review): demo code — CUDA API return codes are not checked and the
// device allocations are never cudaFree'd; add both in production code.
int *w, *e, *i;
cudaMalloc(&w, derWeight.size()*sizeof(int));
cudaMalloc(&e, error.size()*sizeof(int));
cudaMalloc(&i, input.size()*sizeof(int));
cudaMemcpy(w, derWeight.data(), derWeight.size()*sizeof(derWeight[0]), cudaMemcpyHostToDevice);
cudaMemcpy(e, error.data(), error.size()*sizeof(error[0]), cudaMemcpyHostToDevice);
cudaMemcpy(i, input.data(), input.size()*sizeof(input[0]), cudaMemcpyHostToDevice);
// 512 threads per block, enough blocks to cover every derWeight element.
k<<<(derWeight.size()+511)/512, 512>>>(w, e, i, input.size(), error.size());
// The blocking device-to-host copy also synchronizes with the kernel
// before the results are read on the host.
cudaMemcpy(derWeight.data(), w, derWeight.size()*sizeof(derWeight[0]), cudaMemcpyDeviceToHost);
for (int j = 0; j < derWeight.size(); j++) std::cout << derWeight[j] << " ";
std::cout << std::endl;
}
# nvcc -o t42 t42.cu
# compute-sanitizer ./t42
========= COMPUTE-SANITIZER
1 1 1 1 1 2 3 4 1 3 5 7
1 1 1 1 1 2 3 4 1 3 5 7
========= ERROR SUMMARY: 0 errors
#
``````

Thank you very much for your help.
These lines were especially helpful:
j = index/input.size()
k = index%input.size()

I’m using Java and calling CUDA.
The calculation results coincided.
My code:

"extern \"C\"\n" +
"__global__ void derivativeWeight(const float* __restrict__ input, const float* __restrict__ error, float* derWeight, int rows, int columns)\n" +
"{\n" +
"    unsigned idx = threadIdx.x + blockDim.x * blockIdx.x;\n" +
"    if (idx < rows * columns) {\n" +
"        derWeight[idx] += error[idx / columns] * input[idx % columns];\n" +
"    }\n" +
"}\n" +

int row = error.size();
int column = input.size();
int n = row * column;
CUfunction function = new CUfunction();
cuModuleGetFunction(function, helperModule, "derivativeWeight");
Pointer kernelParameters = Pointer.to(Pointer.to(input.getData_gpu()), Pointer.to(error.getData_gpu()), Pointer.to(derWeight.getData_gpu()), Pointer.to(new int[]{row}), Pointer.to(new int[]{column}));
int blockSizeX = (int) Math.min(n, Math.pow(BLOCK_SIZE, 1));
int gridSizeX = (int) Math.ceil((double) n / blockSizeX);

cuLaunchKernel(function,
gridSizeX, 1, 1, // Grid dimension
blockSizeX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);