It is not clear how to implement the index.

I'm a beginner and I find it hard to write CUDA code.
Please help me rewrite the following code as a CUDA kernel. I have three vectors: derWeight, error, and input.

// Flat outer-product accumulation: for every (j, k) pair, add
// error[j] * input[k] into derWeight, with index advancing row-major
// (j selects the row over error, k the column over input).
for (int j = 0, index = 0; j < error.size(); j++) {
for (int k = 0; k < input.size(); k++, index++) {
derWeight[index] += error[j] * input[k];
}
}

That set of nested loops is taking each element of derWeight and adding exactly one product to it: the product of one element of error multiplied by one element of input. So the challenge is to spot the indexing pattern, then come up with a method by which the sub-indexes j and k can be computed from index.

In a nutshell, the overall length of the affected portion of derWeight must be equal to the product of input.size() and error.size(). If you're not sure of this, concoct a simple test case with the code you already have and see how far the loops iterate.

Therefore if input.size() is 2 and error.size() is 3, then derWeight.size() must be at least 6. In that case we have the following pattern by inspection:

       index          j          k
derWeight[0] += error[0] * input[0]
derWeight[1] += error[0] * input[1]
derWeight[2] += error[1] * input[0]
derWeight[3] += error[1] * input[1]
derWeight[4] += error[2] * input[0]
derWeight[5] += error[2] * input[1]

From the above we can observe that:

j = index/input.size()
k = index%input.size()

Which gives us the following kernel code:

// One thread per output element: computes
//   derWeight[j*input_length + k] += error[j] * input[k]
// over the full error x input outer product. Expects a 1-D launch with at
// least input_length*error_length threads; surplus threads exit at the guard.
template <typename T1, typename T2, typename T3>
__global__ void k(T1 *derWeight, T2 *error, T3 *input, size_t input_length, size_t error_length){
    // Widen to size_t BEFORE multiplying: blockDim.x*blockIdx.x is a 32-bit
    // unsigned product and can silently overflow for very large grids.
    size_t idx = (size_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (idx < input_length*error_length)
      derWeight[idx] += error[idx/input_length]*input[idx%input_length];
}

launch like so:

// One thread per derWeight element, 512 threads per block (ceiling division
// for the grid size); the in-kernel bounds check handles the tail block.
k<<<(derWeight.size()+511)/512, 512>>>(d_derWeight, d_error, d_input, input.size(), error.size());

Here is a complete example:

# cat t42.cu
#include <vector>
#include <iostream>
#include <algorithm>
#include <numeric>

// One thread per output element: computes
//   derWeight[j*input_length + k] += error[j] * input[k]
// over the full error x input outer product. Expects a 1-D launch with at
// least input_length*error_length threads; surplus threads exit at the guard.
template <typename T1, typename T2, typename T3>
__global__ void k(T1 *derWeight, T2 *error, T3 *input, size_t input_length, size_t error_length){
    // Widen to size_t BEFORE multiplying: blockDim.x*blockIdx.x is a 32-bit
    // unsigned product and can silently overflow for very large grids.
    size_t idx = (size_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (idx < input_length*error_length)
      derWeight[idx] += error[idx/input_length]*input[idx%input_length];
}

int main(){
  // Sizes: error has el entries, input has il; derWeight holds their el*il
  // outer product.
  const int el = 3;
  const int il = 4;
  std::vector<int> derWeight(el*il);
  std::vector<int> error(el);
  std::vector<int> input(il);
  std::fill(derWeight.begin(), derWeight.end(), 1);
  std::iota(error.begin(), error.end(),0);
  std::iota(input.begin(), input.end(),0);
  // CPU reference: the original nested-loop version, printed for comparison.
  for (int j = 0, index = 0; j < error.size(); j++) {
    for (int k = 0; k < input.size(); k++, index++) {
      derWeight[index] += error[j] * input[k];
    }
  }
  for (int j = 0; j < derWeight.size(); j++) std::cout << derWeight[j] << " ";
  std::cout << std::endl;

  // GPU path: reset derWeight and redo the computation with the kernel.
  std::fill(derWeight.begin(), derWeight.end(), 1);
  int *w, *e, *i;
  cudaMalloc(&w, derWeight.size()*sizeof(int));
  cudaMalloc(&e, error.size()*sizeof(int));
  cudaMalloc(&i, input.size()*sizeof(int));
  cudaMemcpy(w, derWeight.data(), derWeight.size()*sizeof(derWeight[0]), cudaMemcpyHostToDevice);
  cudaMemcpy(e, error.data(), error.size()*sizeof(error[0]), cudaMemcpyHostToDevice);
  cudaMemcpy(i, input.data(), input.size()*sizeof(input[0]), cudaMemcpyHostToDevice);
  k<<<(derWeight.size()+511)/512, 512>>>(w, e, i, input.size(), error.size());
  // A kernel launch does not return an error directly: query the sticky
  // launch status now, then let the synchronizing D2H copy surface any
  // asynchronous in-kernel fault.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) { std::cerr << "launch error: " << cudaGetErrorString(err) << std::endl; return 1; }
  err = cudaMemcpy(derWeight.data(), w, derWeight.size()*sizeof(derWeight[0]), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) { std::cerr << "copy error: " << cudaGetErrorString(err) << std::endl; return 1; }
  for (int j = 0; j < derWeight.size(); j++) std::cout << derWeight[j] << " ";
  std::cout << std::endl;
  // Release device allocations (the original leaked these until process exit).
  cudaFree(w);
  cudaFree(e);
  cudaFree(i);
}
# nvcc -o t42 t42.cu
# compute-sanitizer ./t42
========= COMPUTE-SANITIZER
1 1 1 1 1 2 3 4 1 3 5 7
1 1 1 1 1 2 3 4 1 3 5 7
========= ERROR SUMMARY: 0 errors
#

Thank you very much for your help.
It especially helped:
j = index/input.size()
k = index%input.size()

I'm using Java and calling CUDA from it (via JCuda).
The calculation results matched.
My code:

// CUDA C source for the kernel, embedded as a Java string. Note the escaped
// inner quotes and the double underscores in __global__/__restrict__ — both
// are easy to lose when pasting through markdown.
"extern \"C\"\n" +
"__global__ void derivativeWeight(const float* __restrict__ input, const float* __restrict__ error, float* derWeight, int rows, int columns)\n" +
"{\n" +
"    unsigned idx = threadIdx.x + blockDim.x * blockIdx.x;\n" +
"    if (idx < rows * columns) {\n" +
"        derWeight[idx] += error[idx / columns] * input[idx % columns];\n" +
"    }\n" +
"}\n" +

// rows comes from error, columns from input; derWeight has rows*columns entries.
int row = error.size();
int column = input.size();
int n = row * column;
CUfunction function = new CUfunction();
cuModuleGetFunction(function, helperModule, "derivativeWeight");
// Arguments in the same order as the C signature above; scalar kernel
// parameters are passed as pointers to single-element arrays.
Pointer kernelParameters = Pointer.to(Pointer.to(input.getData_gpu()), Pointer.to(error.getData_gpu()), Pointer.to(derWeight.getData_gpu()), Pointer.to(new int[]{row}), Pointer.to(new int[]{column}));
// One thread per element, capped at BLOCK_SIZE threads per block.
int blockSizeX = Math.min(n, BLOCK_SIZE);
int gridSizeX = (int) Math.ceil((double) n / blockSizeX);

cuLaunchKernel(function,
    gridSizeX, 1, 1,       // Grid dimension
    blockSizeX, 1, 1,      // Block dimension
    0, null,               // Shared memory size and stream
    kernelParameters, null // Kernel- and extra parameters
);
if (Use.DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();