Well, the problem that I got into for several days is that, the cublasSasum_v2 works fine when the output(dresults) is on host memories(see the code ->“float *dresults = new float[C];”), but when the output is on device memories, it throws error(with the output given by the code->“float *dresults = nullptr;CHECK(cudaMalloc((void **) &dresults, C * sizeof(float)));”)! Anybody has an idea to save me out?!
int size = H * W, i;
cublasStatus_t status;
cublasHandle_t handle[C];
for (i = 0; i < C; ++i) {
status = cublasCreate_v2(&handle[i]);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cout << "#" << i << ", CUBLAS initialization error:" << status << std::endl;
abort();
}
cublasSetStream_v2(handle[i], stream);
}
// float *dresults = new float[C];
float *dresults = nullptr;
CHECK(cudaMalloc((void **) &dresults, C * sizeof(float)));
for (i = 0; i < C; ++i) {
status = cublasSasum_v2(handle[i], size, ((float *) gpu_buffer_in) + size * i, 1, &dresults[i]);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cout << "num:" << i << ", Cublas failure: " << status << std::endl;
abort();
}
}
CHECK(cudaFree(dresults));
for (i = 0; i < C; ++i) {
cublasDestroy_v2(handle[i]);
}