The CUDA Programming Guide says that shared memory allocations above 48 KB per block require dynamic shared memory plus an explicit opt-in, so I tested on an RTX 4090D with a statically allocated shared memory array well above that limit (99 KB in the repro below). However, the following kernel runs without any failure or warning and prints the expected result. Am I missing something? Does static allocation actually support more than 48 KB? For comparison, a sketch of the dynamic opt-in path I expected to need follows the repro.
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
__global__ void func(float* output0, float* input0) {
    constexpr int SIZE = 25344;   // 25344 * 4 B = 101376 B = 99 KB
    __shared__ float data[SIZE];  // static shared memory allocation of 99 KB
    data[0] = 0.0f;
    if (threadIdx.x == 0) {
        for (int i = 1; i < SIZE; i++) {
            data[i] = data[i - 1] + 1.0f;
        }
    }
    __syncthreads();
    output0[threadIdx.x] = input0[threadIdx.x] + data[SIZE - 1 - threadIdx.x];
}
int main() {
    const int N = 512;
    const int block_size = 512;
    const int grid_size = 10;

    float *h_input = new float[N];
    float *h_output = new float[N];
    for (int i = 0; i < N; i++) {
        h_input[i] = 1;
        h_output[i] = 0;
    }

    float *d_input, *d_output;
    cudaMalloc(&d_input, N * sizeof(float));
    cudaMalloc(&d_output, N * sizeof(float));
    cudaMemcpy(d_input, h_input, N * sizeof(float), cudaMemcpyHostToDevice);

    // Launch directly, with no cudaFuncSetAttribute opt-in beforehand.
    func<<<dim3(grid_size, 1, 1), dim3(block_size, 1, 1)>>>(d_output, d_input);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Failed to launch kernel: %s\n", cudaGetErrorString(err));
    }
    // Also catch errors that only surface when the kernel actually executes.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("Kernel execution failed: %s\n", cudaGetErrorString(err));
    }

    cudaMemcpy(h_output, d_output, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_input);
    cudaFree(d_output);

    for (int i = 0; i < 10; i++) {
        std::cout << i << ": " << h_output[i] << std::endl;
    }
    delete[] h_input;
    delete[] h_output;
    return 0;
}
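
For comparison, here is a minimal sketch of the opt-in path I expected to be required for more than 48 KB, based on my reading of the guide: the array becomes extern __shared__, cudaFuncSetAttribute raises the per-kernel limit via cudaFuncAttributeMaxDynamicSharedMemorySize, and the byte count goes in the third launch-config parameter. The kernel name func_dyn and the variable names are just mine for illustration; I also query cudaDevAttrMaxSharedMemoryPerBlockOptin to print the device's per-block opt-in limit.

#include <cuda_runtime.h>
#include <stdio.h>

constexpr int SIZE = 25344;  // same 99 KB worth of floats as in the repro above

// Sketch only: dynamic shared memory version of the kernel above.
__global__ void func_dyn(float* output0, float* input0) {
    extern __shared__ float data[];  // size supplied at launch time
    data[0] = 0.0f;
    if (threadIdx.x == 0) {
        for (int i = 1; i < SIZE; i++) {
            data[i] = data[i - 1] + 1.0f;
        }
    }
    __syncthreads();
    output0[threadIdx.x] = input0[threadIdx.x] + data[SIZE - 1 - threadIdx.x];
}

int main() {
    // Print the per-block shared memory limit that the opt-in unlocks on this device.
    int dev = 0, max_optin = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&max_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
    printf("cudaDevAttrMaxSharedMemoryPerBlockOptin: %d bytes\n", max_optin);

    const int N = 512;
    const size_t smem_bytes = SIZE * sizeof(float);  // 101376 bytes

    float *d_input, *d_output;
    cudaMalloc(&d_input, N * sizeof(float));
    cudaMalloc(&d_output, N * sizeof(float));
    cudaMemset(d_input, 0, N * sizeof(float));

    // Explicit opt-in: allow this kernel to request more than 48 KB of dynamic shared memory.
    cudaError_t err = cudaFuncSetAttribute(
        func_dyn, cudaFuncAttributeMaxDynamicSharedMemorySize, (int)smem_bytes);
    if (err != cudaSuccess) {
        printf("cudaFuncSetAttribute failed: %s\n", cudaGetErrorString(err));
    }

    // Third launch-config argument is the dynamic shared memory size in bytes.
    func_dyn<<<10, N, smem_bytes>>>(d_output, d_input);
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("Kernel failed: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_input);
    cudaFree(d_output);
    return 0;
}

On compute capability 8.9 I expect the attribute query to report 101376 bytes (99 KB), which is why I chose SIZE = 25344. What confuses me is that the static version above apparently works without any of this opt-in machinery.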