Hi team,
I have found that Nsight Systems does not profile unified (managed) memory the way it profiles normal device memory.
Below are the test code, the profiling configuration, and the resulting report.
Could you help me find out why?
// Adds 1.0f to each of the first N elements of `vec`.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Any grid size is valid: each thread starts at its global index and then
// advances by the total number of launched threads until it passes N, so
// every element is updated exactly once even when the grid has fewer
// threads than N.
//
// vec: pointer the device can dereference, holding at least N floats.
// N:   number of elements to update; N == 0 is a no-op.
__global__
void addOne(float *vec, size_t N) {
  // Do the index arithmetic in size_t: the 32-bit product
  // blockDim.x * blockIdx.x wraps once a launch has more than 2^32 threads,
  // which would make several threads update the same element.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < N; idx += stride) {
    vec[idx] += 1.f;
  }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Managed-memory round trip: allocate a managed buffer, zero it from the
// host, increment every element on the device with addOne, then read the
// buffer back on the host and print whether every element equals 1.0f.
//
// Returns 0 when all CUDA calls succeed (regardless of the comparison
// result) and 1 when any CUDA call reports an error.
//
// NOTE(review): Nsight Systems appears to record unified-memory CPU/GPU
// page-fault events only when the corresponding trace options are enabled
// in the profile configuration - confirm against the Nsight Systems guide.
int main() {
  const size_t nElement = 1 << 28;
  const unsigned int nThread = 1024;
  const size_t nBytes = nElement * sizeof(float);

  float *vec = nullptr;
  // Stop here on allocation failure; memset on an invalid pointer would crash.
  if (!cudaSucceeded(cudaMallocManaged(&vec, nBytes), "cudaMallocManaged")) {
    return 1;
  }

  // Zero the buffer from the host through the managed pointer.
  memset(vec, 0, nBytes);

  // Ceil-divide so the grid covers every element.
  const unsigned int nBlock =
      static_cast<unsigned int>((nElement + nThread - 1) / nThread);
  addOne<<<nBlock, nThread>>>(vec, nElement);

  // A launch returns no status itself: configuration errors are reported by
  // cudaGetLastError(), execution faults by the following synchronize.
  const bool kernelOk =
      cudaSucceeded(cudaGetLastError(), "addOne launch") &&
      cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  if (!kernelOk) {
    cudaFree(vec);
    return 1;
  }

  // Host-side verification; the first mismatch already decides the result.
  bool isSame = true;
  for (size_t i = 0; i < nElement; ++i) {
    if (vec[i] != 1.f) {
      isSame = false;
      break;
    }
  }
  printf("isSame?: %s", isSame ? "true" : "false");

  return cudaSucceeded(cudaFree(vec), "cudaFree") ? 0 : 1;
}