Hi! I want to use 49KB shared memory in 3050 per block, so I have to use dynamic shared memory and follow official guide, but still can no work. Anyone can kindly find my bug? Thank you!!
#include<iostream>
using namespace std;
#include <cstdint>
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <vector>
#include "cuda_runtime.h"
__global__ void sgemm_128x128x8(){
if (threadIdx.x == 0 && blockIdx.x==0 && blockIdx.y==0) {
printf("haha\n");
}
extern __shared__ float smem[];
float* smem_a = smem;
float* smem_b = (float*)&smem_a[256*8];
smem_a[threadIdx.x * 8] = threadIdx.x/3;
__syncthreads();
}
int main(){
int maxbytes = 49*1024;
cudaFuncSetAttribute(sgemm_128x128x8, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
sgemm_128x128x8 << <1, 256, 49 * 1024 * sizeof(float) >> > ();
}