That set of nested loops is taking each element of derWeight
and adding exactly one product to it. It is the product of one element of error
multiplied by one element of input
. So the challenge is to spot the indexing pattern, then come up with a method by which the sub-indexes i
and j
can be computed from index
.
In a nutshell, the overall length of the affected portion of derWeight
must be equal to the product of input.size()
and error.size()
. If you’re not sure of this, you already have the code, concoct a simple test case and see how far the loops iterate.
Therefore if input.size()
is 2 and error.size()
is 3, then derWeight.size()
must be at least 6. In that case we have the following pattern by inspection:
index j k
derWeight[0] += error[0] * input[0]
derWeight[1] += error[0] * input[1]
derWeight[2] += error[1] * input[0]
derWeight[3] += error[1] * input[1]
derWeight[4] += error[2] * input[0]
derWeight[5] += error[2] * input[1]
From the above we can observe that:
j = index/input.size()
k = index%input.size()
Which gives us the following kernel code:
template <typename T1, typename T2, typename T3>
__global__ void k(T1 *derWeight, T2 *error, T3 *input, size_t input_length, size_t error_length){
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < input_length*error_length)
derWeight[idx] += error[idx/input_length]*input[idx%input_length];
}
launch like so:
k<<<(derWeight.size()+511)/512, 512>>>(d_derWeight, d_error, d_input, input.size(), error.size());
Here is a complete example:
# cat t42.cu
#include <vector>
#include <iostream>
#include <algorithm>
#include <numeric>
template <typename T1, typename T2, typename T3>
__global__ void k(T1 *derWeight, T2 *error, T3 *input, size_t input_length, size_t error_length){
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < input_length*error_length)
derWeight[idx] += error[idx/input_length]*input[idx%input_length];
}
int main(){
const int el = 3;
const int il = 4;
std::vector<int> derWeight(el*il);
std::vector<int> error(el);
std::vector<int> input(il);
std::fill(derWeight.begin(), derWeight.end(), 1);
std::iota(error.begin(), error.end(),0);
std::iota(input.begin(), input.end(),0);
for (int j = 0, index = 0; j < error.size(); j++) {
for (int k = 0; k < input.size(); k++, index++) {
derWeight[index] += error[j] * input[k];
}
}
for (int j = 0; j < derWeight.size(); j++) std::cout << derWeight[j] << " ";
std::cout << std::endl;
std::fill(derWeight.begin(), derWeight.end(), 1);
int *w, *e, *i;
cudaMalloc(&w, derWeight.size()*sizeof(int));
cudaMalloc(&e, error.size()*sizeof(int));
cudaMalloc(&i, input.size()*sizeof(int));
cudaMemcpy(w, derWeight.data(), derWeight.size()*sizeof(derWeight[0]), cudaMemcpyHostToDevice);
cudaMemcpy(e, error.data(), error.size()*sizeof(error[0]), cudaMemcpyHostToDevice);
cudaMemcpy(i, input.data(), input.size()*sizeof(input[0]), cudaMemcpyHostToDevice);
k<<<(derWeight.size()+511)/512, 512>>>(w, e, i, input.size(), error.size());
cudaMemcpy(derWeight.data(), w, derWeight.size()*sizeof(derWeight[0]), cudaMemcpyDeviceToHost);
for (int j = 0; j < derWeight.size(); j++) std::cout << derWeight[j] << " ";
std::cout << std::endl;
}
# nvcc -o t42 t42.cu
# compute-sanitizer ./t42
========= COMPUTE-SANITIZER
1 1 1 1 1 2 3 4 1 3 5 7
1 1 1 1 1 2 3 4 1 3 5 7
========= ERROR SUMMARY: 0 errors
#