I took some old code of mine and tried to run it on an RTX 2060 Super, but I can't get shared memory to work.
I'm using Java (JCuda) with CUDA 12.0.
public static int SharedMemorySizeGPU = 64 * 1024; // 64 KB of dynamic shared memory to request
Previously I used: cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
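In JCuda that maps to the runtime API, roughly (a sketch):

import jcuda.runtime.JCuda;
import jcuda.runtime.cudaFuncCache;

// Old approach: ask the device to prefer shared memory over L1 cache
JCuda.cudaDeviceSetCacheConfig(cudaFuncCache.cudaFuncCachePreferShared);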
Now I run:
JCudaDriver.cuFuncSetAttribute(function, CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
JCudaDriver.cuFuncSetAttribute(function, CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, SharedMemorySizeGPU);
But shared memory still seems unavailable: no error occurs, and every value I read from it is zero.
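To make sure nothing fails silently, the attribute can be read back after setting it (a minimal sketch; function is the CUfunction loaded from my module):

// imports: jcuda.driver.JCudaDriver, jcuda.driver.CUfunction_attribute
JCudaDriver.setExceptionsEnabled(true); // throw on CUDA errors instead of returning status codes
int[] maxDynamicShared = { 0 };
JCudaDriver.cuFuncGetAttribute(maxDynamicShared,
        CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, function);
System.out.println("Max dynamic shared memory: " + maxDynamicShared[0] + " bytes"); // expect 65536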
The kernel source, assembled as a Java string:
String kernelSource =
"__device__ int SharedMemorySize = 64 * 1024 / 4;\n" + // 64 KB expressed as a float count
"extern \"C\"\n" +
"__global__ void derSoftmax(const float* __restrict__ output, const float* __restrict__ error, float* data, int row, int column)\n" +
"{\n" +
"    extern __shared__ float shared[]; // dynamic shared memory, sized at launch\n" +
" int k = blockDim.x * blockIdx.x + threadIdx.x;\n" +
" int i = blockDim.y * blockIdx.y + threadIdx.y;\n" +
" if (k < row && i < column)\n" +
" {\n" +
" int idx = k * blockDim.y * gridDim.y + i;\n" +
" if (idx < SharedMemorySize) {\n" +
" shared[idx] = error[idx];\n" +
" }\n" +
" __syncthreads();\n" +
" float value;\n" +
" int index = k * column;\n" +
" int indexI = index + i;\n" +
" data[indexI] = 0.0f;\n" +
" int indexJ = index;\n" +
" float o = output[indexI];\n" +
" float sum = 0;\n" +
" for (int j = 0; j < column; j++, indexJ++) {\n" +
" if (i != j) {\n" +
" value = o * -output[indexJ];\n" +
" } else {\n" +
" value = o * (1.0f - o);\n" +
" }\n" +
" if (indexJ < SharedMemorySize) {\n" +
" sum += shared[indexJ] * value;\n" +
" }\n" +
" else {\n" +
" sum += error[indexJ] * value;\n" +
" }\n" +
" }\n" +
" data[indexI] = sum;\n" +
" }\n" +
"}\n";
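Could the problem be the launch itself? With an extern __shared__ array, I believe the actual allocation size has to be passed as the sharedMemBytes argument of cuLaunchKernel. A minimal sketch of such a launch (dOutput, dError, dData are placeholders for my device buffers, and the 16x16 block shape is arbitrary):

// imports: jcuda.Pointer, jcuda.driver.JCudaDriver
int blockX = 16, blockY = 16;                  // placeholder block shape
int gridX = (row + blockX - 1) / blockX;       // k covers [0, row)
int gridY = (column + blockY - 1) / blockY;    // i covers [0, column)
Pointer kernelParams = Pointer.to(
        Pointer.to(dOutput),                   // const float* output
        Pointer.to(dError),                    // const float* error
        Pointer.to(dData),                     // float* data
        Pointer.to(new int[]{ row }),
        Pointer.to(new int[]{ column }));
JCudaDriver.cuLaunchKernel(function,
        gridX, gridY, 1,                       // grid dimensions
        blockX, blockY, 1,                     // block dimensions
        SharedMemorySizeGPU,                   // sharedMemBytes: size of extern __shared__ float shared[]
        null,                                  // default stream
        kernelParams, null);
JCudaDriver.cuCtxSynchronize();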
Please help…