My device is an RTX 2080 Ti, and I am trying to write a CUDA program for it. However, when I try to allocate device memory, I always get an “out of memory” error from one specific call. I have made sure that there is enough free GPU memory, and the strangest thing is that I can allocate even more GPU memory for the other pointers; only this one allocation fails. My code is as follows.
float caltrace_gpu(float* W, float* WTWInv, float * data, int N, long long M,
int C, int num_block, float* mean, float* rv, cublasHandle_t handle)
{
// Set cuda context
cudaError_t cudaStat;
cublasStatus_t stat;
// Initialize device pointer
float* d_data;
float* d_W;
float* d_WTWInv;
float* d_v;
float* d_result;
float* d_result2;
float* d_temp;
float* d_one;
float* d_mean;
float prod;
float* d_cby1;
float* d_cby2;
float* d_cby3;
float* d_R;
float* d_data_final;
float* d_temp_final;
//Initialize one vector in order to perform element wise sub
float* vector_ones = new float[N];
for (long long i = 0; i < N; i++)
{
vector_ones[i] = 1.0f;
}
// The coefficient needed in computation
float al = 1.0f;
float al_n = -1.0f;
float beta = 0.0f;
// settings for the blocks
int P = num_block;
int SNIPs = static_cast<int>(M / P + 1);
int SNIPs_final = M - SNIPs * (P - 1);
// Allocate memeory on device
cudaStat = cudaMalloc((void**)&d_data, SNIPs * N * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 1" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_W, N * C * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 2" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_WTWInv, C * C * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 3" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_v, N * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 4" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_result2, N * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 5" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_result, N * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 6" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_temp, SNIPs * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 7" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_one, N * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 8" << endl;
return EXIT_FAILURE;
}
// cudaStat = cudaMalloc((void**)&d_mean, SNIPs * sizeof(float));
cudaStat = cudaMalloc((void**)&d_cby1, C * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 9" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_cby2, C * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 10" << endl;
return EXIT_FAILURE;
}
cudaStat = cudaMalloc((void**)&d_cby3, N * sizeof(float));
if (cudaStat != cudaSuccess) {
cout << "device memory allocation failed 11" << endl;
return EXIT_FAILURE;
}
The allocation for d_WTWInv always fails. However, when I delete that call, the allocations for the subsequent pointers succeed. C is equal to 20.