__global__ void f(int* reorder_map, half* buffer, ..., int chunk_num, int all_num) {
    // chunk_num is only known at runtime, so the per-chunk result lives in
    // dynamically sized shared memory (size passed as the third launch parameter)
    extern __shared__ half result[];
    // some code to calculate the array `result`
    // write-back stage
    int buffer_zone = blockIdx.x;
    int chunk_zone = blockIdx.y;
    int offset = all_num * buffer_zone + chunk_num * chunk_zone;
    for (int i = 0; i < chunk_num; i++) { // this loop is shared across all threads of the block, not run per thread; written sequentially only to simplify the question
        buffer[offset + reorder_map[offset + i]] = result[i];
    }
}
__global__ void reduce(half* buffer, half* dst, int split_num, int reduce_chunk_num, int all_num) {
    float result = 0.0f; // accumulate in float; an int accumulator would silently truncate the half values
    int reduce_chunk_zone = blockIdx.x;
    int add_idx = threadIdx.x;
    int offset = reduce_chunk_zone * reduce_chunk_num + add_idx;
    for (int i = 0; i < split_num; i++) {
        result += __half2float(buffer[all_num * i + offset]);
    }
    dst[offset] = __float2half(result);
}
void split_sum(int* reorder_map, half* buffer, half* dst, int split_num, int all_num) {
    int chunk_num = ...;
    dim3 threadsPerBlock(..., ...);
    dim3 numBlocks(split_num, (all_num + chunk_num - 1) / chunk_num); // integer round-up; ceil() on an integer division is a no-op
    f<<<numBlocks, threadsPerBlock, chunk_num * sizeof(half)>>>(reorder_map, buffer, ..., chunk_num, all_num);

    int reduce_chunk_num = ...;
    dim3 reduceThreadsPerBlock(reduce_chunk_num);
    dim3 reduceNumBlocks((all_num + reduce_chunk_num - 1) / reduce_chunk_num);
    reduce<<<reduceNumBlocks, reduceThreadsPerBlock>>>(buffer, dst, split_num, reduce_chunk_num, all_num);
}
I wonder if there is any other way to rewrite these two kernels to accelerate the whole snippet.
Here is some more information about the parameters used in the code above:
1 <= split_num <= 6;
reorder_map[all_num * buffer_zone + chunk_num * chunk_zone + i] (0 <= i < chunk_num) is a rearrangement of the index range [all_num * buffer_zone + chunk_num * chunk_zone, all_num * buffer_zone + chunk_num * chunk_zone + chunk_num);
There is no need to worry about running out of memory;
The GPU is an A100;
For each chunk, all the data the calculation needs fits entirely in the L2 cache (less than 40 MB of data per chunk).
I wonder if there is any way to optimize this snippet further, since there is no dependency between different chunks. Is it possible to launch the reduce kernel for chunk i while f is still computing chunk i+1, i.e. to write host code like this?
void split_sum() {
// calculate for all split of chunk 0
// calculate for all split of chunk 1, reduce for chunk 0
// calculate for all split of chunk 2, reduce for chunk 1
// calculate for all split of chunk 3, reduce for chunk 2
// ...
}
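Concretely, I imagine something like the following sketch with two CUDA streams and one event per chunk. Everything here is illustrative rather than a working implementation: f_chunk and reduce_chunk are hypothetical per-chunk variants of the kernels above that take the chunk index as an extra argument, and the block sizes (128 threads, reduce_chunk_num = 256) are placeholders.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <vector>

// Hypothetical per-chunk variants of f and reduce: each launch touches only one chunk.
__global__ void f_chunk(int* reorder_map, half* buffer, int chunk_num, int all_num, int chunk_idx) {
    // same body as f above, restricted to blockIdx.x = split index and this chunk_idx
}
__global__ void reduce_chunk(half* buffer, half* dst, int split_num,
                             int reduce_chunk_num, int all_num, int chunk_idx) {
    // same body as reduce above, restricted to the columns of chunk chunk_idx
}

void split_sum_pipelined(int* reorder_map, half* buffer, half* dst,
                         int split_num, int all_num, int chunk_num) {
    int num_chunks = (all_num + chunk_num - 1) / chunk_num;
    int reduce_chunk_num = 256;             // placeholder, tune as in the original code

    cudaStream_t compute_stream, reduce_stream;
    cudaStreamCreate(&compute_stream);
    cudaStreamCreate(&reduce_stream);

    // One event per chunk: "all splits of chunk c have been written to buffer".
    std::vector<cudaEvent_t> chunk_done(num_chunks);
    for (int c = 0; c < num_chunks; c++)
        cudaEventCreateWithFlags(&chunk_done[c], cudaEventDisableTiming);

    for (int c = 0; c < num_chunks; c++) {
        // f for all split_num splits of chunk c on the compute stream (128 threads is a placeholder).
        f_chunk<<<split_num, 128, chunk_num * sizeof(half), compute_stream>>>(
            reorder_map, buffer, chunk_num, all_num, c);
        cudaEventRecord(chunk_done[c], compute_stream);

        // reduce for chunk c on the other stream; it waits only for chunk c's
        // f launch, so f for chunk c+1 can overlap with it.
        cudaStreamWaitEvent(reduce_stream, chunk_done[c], 0);
        int reduce_blocks = (chunk_num + reduce_chunk_num - 1) / reduce_chunk_num;
        reduce_chunk<<<reduce_blocks, reduce_chunk_num, 0, reduce_stream>>>(
            buffer, dst, split_num, reduce_chunk_num, all_num, c);
    }

    cudaStreamSynchronize(compute_stream);
    cudaStreamSynchronize(reduce_stream);
    for (int c = 0; c < num_chunks; c++) cudaEventDestroy(chunk_done[c]);
    cudaStreamDestroy(compute_stream);
    cudaStreamDestroy(reduce_stream);
}
Is this the right pattern for overlapping the two kernels?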
For the reduce kernel, it looks to me that you want to compute per-column sums within a matrix.
If you can pad each row to a multiple of 4 bytes, you could process two consecutive columns per thread by using half2 instead of half.
With padding to a multiple of 8 or 16 bytes, you could process 4 or 8 consecutive columns per thread, still with half2 arithmetic, but using wider vectorized memory instructions.
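Here is a minimal sketch of the two-columns-per-thread variant, assuming buffer is laid out as split_num rows of all_num contiguous half values, all_num is (padded to) even, and buffer and dst are at least 4-byte aligned; the kernel name and launch configuration are only for illustration.
#include <cuda_fp16.h>

// Each thread owns one pair of adjacent columns and accumulates their sums
// across the split_num rows with a single half2 add per row.
__global__ void reduce_half2(const half* __restrict__ buffer, half* __restrict__ dst,
                             int split_num, int all_num /* assumed even */) {
    int n2   = all_num / 2;                            // number of column pairs per row
    int col2 = blockIdx.x * blockDim.x + threadIdx.x;  // which column pair this thread handles
    if (col2 >= n2) return;

    const half2* buf2 = reinterpret_cast<const half2*>(buffer);
    half2*       dst2 = reinterpret_cast<half2*>(dst);

    half2 acc = __float2half2_rn(0.0f);
    for (int i = 0; i < split_num; i++)
        acc = __hadd2(acc, buf2[i * n2 + col2]);       // two column sums per instruction
    dst2[col2] = acc;
}

// Example launch:
// reduce_half2<<<(all_num / 2 + 255) / 256, 256>>>(buffer, dst, split_num, all_num);
The 8- and 16-byte variants follow the same pattern, except each thread loads two or four half2 values at once (e.g. through a float2- or float4-sized load) and keeps four or eight running column sums.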