This is a function of mine to sort a large FEM matrix in COO format. It works for the most part, except with very large datasets (currently around 10GB). I am working on an A30. As you can see, I have it print how much memory it wants for the buffer in step 3.
For the dataset in question, it wants to allocate just over 10GB, when the A30 still has over 13 GB of memory free. It crashes as soon as it tries to allocate in step 4, with an out of memory error. Before it does this however, it completely fills my 128GB of system RAM. Why is it telling me it only wants 10GB, but then proceeds to fill more than double that across the VRAM and system RAM? Is there something obviously wrong with my code? Keep in mind, the crash is in step 4 with the cudamalloc function, wherein I tell it exactly how much to allocate, which is the 10GB.
void sortCOOMatrix(cusparseHandle_t cusparseHandle, int rows, int cols, int nnz, int* d_cooRow, int* d_cooCol, real* d_cooVal) {
// Step 1: Create arrays for sorted values and permutation, permutation is used for sorting value array after row-column sort
int* d_permutation;
real* d_values_sorted;
void* d_buffer;
size_t bufferSize;
// Allocate memory for sorted values and permutation array
checkCudaError(cudaMalloc((void**)&d_values_sorted, nnz * sizeof(real)), "Failed to allocate memory for sorted values");
checkCudaError(cudaMalloc((void**)&d_permutation, nnz * sizeof(int)), "Failed to allocate memory for permutation array");
// Step 2: Create descriptors for sparse and dense vectors
cusparseSpVecDescr_t vec_permutation;
cusparseDnVecDescr_t vec_values;
// Create a sparse vector descriptor for permutation and a dense vector for the original values
checkCusparseError(cusparseCreateSpVec(&vec_permutation, nnz, nnz, d_permutation, d_values_sorted, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_PRECISION), "Failed to create sparse vector descriptor for permutation");
checkCusparseError(cusparseCreateDnVec(&vec_values, nnz, d_cooVal, CUDA_PRECISION), "Failed to create dense vector descriptor for values");
// Step 3: Determine buffer size required for sorting
checkCusparseError(cusparseXcoosort_bufferSizeExt(cusparseHandle, rows, cols, nnz, d_cooRow, d_cooCol, &bufferSize), "Failed to determine buffer size for COO sorting");
printf("\nSort buffer requires %zu MB of memory\n", bufferSize / (1024*1024));
// Step 4: Allocate the buffer memory needed for sorting
checkCudaError(cudaMalloc(&d_buffer, bufferSize), "Failed to allocate buffer for sorting");
// Step 5: Create an identity permutation array to retain initial order
checkCusparseError(cusparseCreateIdentityPermutation(cusparseHandle, nnz, d_permutation), "Failed to create identity permutation");
// Step 6: Sort the COO matrix by rows (and by columns within rows)
checkCusparseError(cusparseXcoosortByRow(cusparseHandle, rows, cols, nnz, d_cooRow, d_cooCol, d_permutation, d_buffer), "Failed to sort COO matrix by rows");
// Step 7: Apply the sorted permutation to the values array to reorder them according to sorted row and column indices
checkCusparseError(cusparseGather(cusparseHandle, vec_values, vec_permutation), "Failed to gather sorted values");
// Step 8: Copy sorted values back to the original values array
checkCudaError(cudaMemcpy(d_cooVal, d_values_sorted, nnz * sizeof(real), cudaMemcpyDeviceToDevice), "Failed to copy sorted values back to the original array");
// Step 9: Clean up allocated resources
checkCusparseError(cusparseDestroyDnVec(vec_values), "Failed to destroy dense vector descriptor");
checkCusparseError(cusparseDestroySpVec(vec_permutation), "Failed to destroy sparse vector descriptor");
checkCudaError(cudaFree(d_values_sorted), "Failed to free sorted values memory");
checkCudaError(cudaFree(d_buffer), "Failed to free buffer memory");
checkCudaError(cudaFree(d_permutation), "Failed to free permutation array memory");
}