Consider the following ways of performing `fill`

on a float array.

- Method1: Each thread performs a single set operation.
- Method 2: On top of method 1, replace scalar store with vector store.
- Method 3: Launch a minimum number of threads that saturate the GPU. Each thread performs as many set operations as necessary.
- Method 4: On top of method 3, replace scalar store with vector store.
- Method 5: Use CUDA driver API cuMemsetD32 for 32-bit set.
- Method 6: Use thrust API.

Here’s the implementation:

```
// nvcc Memset.cu -lcuda -o Memset
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
__global__ void fill1(float* a, size_t num, float value) {
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
if (gtid < num) {
a[gtid] = value;
}
}
__global__ void fill2(float* a, size_t num, float value) {
float4* aAlt = reinterpret_cast<float4*>(a);
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
size_t numFloor = num / 4 * 4;
size_t c = numFloor / 4;
if (gtid < c) {
aAlt[gtid] = make_float4(value, value, value, value);
}
if (gtid == 0) {
for (int i = 0; i < num - numFloor; ++i) {
a[num - 1 - i] = value;
}
}
}
__global__ void fill3(float* a, size_t num, float value) {
int numThreads = blockDim.x * gridDim.x;
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = gtid; i < num; i += numThreads) {
a[i] = value;
}
}
__global__ void fill4(float* a, size_t num, float value) {
float4* aAlt = reinterpret_cast<float4*>(a);
int numThreads = blockDim.x * gridDim.x;
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
size_t numFloor = num / 4 * 4;
size_t c = numFloor / 4;
for (int i = gtid; i < c; i += numThreads) {
aAlt[i] = make_float4(value, value, value, value);
}
if (gtid == 0) {
for (int i = 0; i < num - numFloor; ++i) {
a[num - 1 - i] = value;
}
}
}
int main() {
cudaSetDevice(0);
std::size_t num = 200'000'000ULL;
float* a_d{};
cudaMalloc(&a_d, sizeof(float) * num);
int blockSize{};
int gridSize{};
float value = 1.0f;
int loopCount = 5;
// Method1: Each thread performs a single set operation.
for (int i = 0; i < loopCount; ++i) {
cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, fill1);
gridSize = (num + blockSize - 1) / blockSize;
fill1<<<gridSize, blockSize>>>(a_d, num, value);
}
// Method 2: On top of method 1, replace scalar store with vector store.
for (int i = 0; i < loopCount; ++i) {
cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, fill2);
size_t numFloor = num / 4 * 4;
size_t c = numFloor / 4;
gridSize = (c + blockSize - 1) / blockSize;
fill2<<<gridSize, blockSize>>>(a_d, num, value);
}
// Method 3: A minimum number of threads are launched to saturate the GPU.
// Each thread performs as many number of set operations as needed.
for (int i = 0; i < loopCount; ++i) {
cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, fill3);
fill3<<<gridSize, blockSize>>>(a_d, num, value);
}
// Method 4: On top of method 3, replace scalar store with vector store.
for (int i = 0; i < loopCount; ++i) {
cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, fill4);
fill4<<<gridSize, blockSize>>>(a_d, num, value);
}
// Method 5: Use CUDA driver API cuMemsetD32 for 32-bit set.
for (int i = 0; i < loopCount; ++i) {
cuMemsetD32(reinterpret_cast<CUdeviceptr>(a_d), *reinterpret_cast<unsigned int*>(&value),
num);
}
// Method 6: Use thrust API.
for (int i = 0; i < loopCount; ++i) {
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(a_d);
thrust::fill(dev_ptr, dev_ptr + num, value);
}
cudaDeviceSynchronize();
cudaFree(a_d);
}
```

Below is the performance result on a 3080 Ti (Ampere, CC=8.6). Understandably, `cuMemsetD32`

and thrust have the top performance. But why does `fill3`

, which is supposed to reduce the block scheduling overhead by launching the minimal amount of threads that saturate the device, have worse performance than the vanilla `fill1`

? In particular, `fill1`

achieves a memory throughput of 97% and has 25.27 warp cycles per issued instruction. These numbers become worse for `fill3`

, 86% and 73.86.

```
Total Time Avg
4.607 ms 921.342 micro-s fill1
4.677 ms 935.300 micro-s fill2
5.135 ms 1.027 ms fill3
4.704 ms 940.843 micro-s fill4
4.660 ms 932.023 micro-s [CUDA memset]
4.619 ms 923.857 micro-s thrust
```

The second question is why using the vector store in place of scalar store (as in `fill2`

) has no effect on `fill1`

? It looks as if `fill1`

is bottlenecked by the outstanding memory requests alone, and that completely offsets a reduction in the number of memory instructions.