Hi! I am trying to allocate one contiguous block of GPU memory and then split it into several pointers. The code below runs on my RTX 3050 PC (Windows 10), but it does not work on an A100 Linux server!
#include "cuda_runtime.h"

#include <cstdio>
#include <cstdlib>
// Build a row x col matrix filled with a simple deterministic pattern:
// the entry at flat index idx holds its 1-based linear index (idx + 1)
// while that index is below 100, and 10 for every later entry.
// Caller owns the returned buffer (allocated with new[]) and must
// delete[] it. Despite the name, nothing here is random.
float* random_matrix(int row, int col) {
    const int total = row * col;
    float* mat = new float[total];
    for (int idx = 0; idx < total; ++idx) {
        // idx == i * col + j for the original (i, j) double loop.
        mat[idx] = (idx + 1 < 100) ? static_cast<float>(idx + 1) : 10.0f;
    }
    return mat;
}
// Report-and-abort wrapper: every CUDA runtime call returns a cudaError_t
// that must be checked; an ignored failure (e.g. a too-large cudaMalloc)
// silently poisons every later call.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// Allocates ONE contiguous device buffer holding four weight matrices
// back-to-back (k*n, n*n1, n1*n2, n2*n3 floats) and carves it into
// per-layer sub-pointers, plus separate device buffers for the input
// matrix `a` (k x m) and the output `c` (m x n3).
int main()
{
    const int m = 4000, k = 2000, n = 128, n1 = 128, n2 = 128, n3 = 10;
    // Total float count of the packed weight buffer; size_t math avoids
    // int overflow if the dimensions are ever enlarged.
    const size_t weightCount = (size_t)k * n + (size_t)n * n1
                             + (size_t)n1 * n2 + (size_t)n2 * n3;

    float* init = random_matrix(1, (int)weightCount);
    float* c = new float[(size_t)m * n3];
    float* a = random_matrix(k, m);

    float* dev_init = nullptr;
    float* dev_c = nullptr;
    float* dev_a = nullptr;
    checkCuda(cudaMalloc((void**)&dev_init, weightCount * sizeof(float)), "cudaMalloc dev_init");
    checkCuda(cudaMalloc((void**)&dev_c, (size_t)m * n3 * sizeof(float)), "cudaMalloc dev_c");
    checkCuda(cudaMalloc((void**)&dev_a, (size_t)m * k * sizeof(float)), "cudaMalloc dev_a");
    checkCuda(cudaMemcpy(dev_init, init, weightCount * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy dev_init");
    checkCuda(cudaMemcpy(dev_a, a, (size_t)k * m * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy dev_a");

    // Carve the single allocation into consecutive sub-matrices.
    // BUG FIX: the original referenced undeclared identifiers dev_wei2 /
    // dev_wei3 here, so the posted code cannot compile at all — the run
    // that "worked" was presumably a stale binary from an earlier build.
    float* dev_b  = dev_init + (size_t)n * k;   // after the k x n block
    float* dev_n2 = dev_b + (size_t)n * n1;     // after the n x n1 block
    float* dev_n3 = dev_n2 + (size_t)n1 * n2;   // after the n1 x n2 block
    float* dev_n4 = dev_n3 + (size_t)n2 * n3;   // one-past-end of the buffer
    (void)dev_n4;

    // Release everything (the original leaked all host and device buffers).
    // Only dev_init is freed — dev_b/dev_n2/dev_n3 are interior pointers
    // into the same allocation and must NOT be passed to cudaFree.
    checkCuda(cudaFree(dev_a), "cudaFree dev_a");
    checkCuda(cudaFree(dev_c), "cudaFree dev_c");
    checkCuda(cudaFree(dev_init), "cudaFree dev_init");
    delete[] a;
    delete[] c;
    delete[] init;
    return 0;
}
(base) ????:~/test/??# nvcc New.cu --ptxas-options=-v -arch=sm_80 -maxrregcount 128
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z15???iiiiiiPfS_S_S_S_S_' for 'sm_80'
ptxas info : Function properties for _Z15???iiiiiiPfS_S_S_S_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 128 registers, 424 bytes cmem[0], 8 bytes cmem[2]
(base) ????:~/test/??# ./a.out
Time = 3.58732e-43 ms
Maybe caching more data in L2 consumes some of the L1, so my earlier shared-memory allocation was too large? But the same code works on my CUDA 11.7 RTX 3050 PC?? Maybe the A100 (sm_80, CUDA 11.6) is just not as clever about this?
But on my 3050PC it returns:
Time = 22.6877 ms
Same code!!!