I'm confused about the synchronization behavior of cudaMalloc (and likewise cudaMemcpy and cudaFree). The test code is below:
#include <pthread.h>
#include <stdio.h>
// Element count (2^20) used for each float buffer allocated in launch_kernel
// and passed to the kernel as the number of elements to write.
const int N = 1 << 20;
// Fills x[0..n) with sqrt(pow(3.14159, i)) for each index i.
//
// Each thread starts at its flat global index and advances by the total
// number of launched threads (blockDim.x * gridDim.x), so any 1D launch
// configuration covers all n elements.
//
// x: pointer through which n floats are written.
// n: number of elements to write.
//
// NOTE: 3.14159 is a double literal, so pow/sqrt are evaluated in double
// precision and the result is narrowed to float on the store.
__global__ void kernel(float* x, int n)
{
    const int stride = blockDim.x * gridDim.x;
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += stride)
    {
        x[i] = sqrt(pow(3.14159, i));
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// pthread entry point: allocates a buffer of N floats, launches `kernel` on it
// with <<<1, 64>>> (no stream argument), allocates a second buffer of N floats
// while that launch is outstanding, then waits on stream 0.
//
// dummy: unused thread argument.
// Returns NULL in all cases; CUDA failures are reported on stderr.
void* launch_kernel(void* dummy)
{
    (void)dummy;
    float* data = NULL;
    float* data1 = NULL;

    // Only launch when the first allocation succeeded, so the kernel never
    // receives an invalid pointer.
    if (cuda_ok(cudaMalloc(&data, N * sizeof(float)), "cudaMalloc(data)"))
    {
        // cudaStream_t stream;
        // cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        kernel<<<1, 64>>>(data, N);
        // A launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        cuda_ok(cudaGetLastError(), "kernel launch");

        cuda_ok(cudaMalloc(&data1, N * sizeof(float)), "cudaMalloc(data1)");
        cuda_ok(cudaStreamSynchronize(0), "cudaStreamSynchronize");
    }

    // Release both buffers after the synchronize. cudaFree(NULL) is a no-op,
    // so this is safe when either allocation failed.
    cuda_ok(cudaFree(data1), "cudaFree(data1)");
    cuda_ok(cudaFree(data), "cudaFree(data)");
    return NULL;
}
// Starts num_threads host threads that each run launch_kernel, waits for all
// of them to finish, then resets the CUDA device.
//
// Exit codes: 0 on success, 1 if a thread could not be created, 2 if a thread
// could not be joined, 3 if cudaDeviceReset reported an error.
int main()
{
    const int num_threads = 8;
    pthread_t threads[num_threads];

    for (int i = 0; i < num_threads; i++)
    {
        if (pthread_create(&threads[i], NULL, launch_kernel, 0))
        {
            fprintf(stderr, "Error creating thread\n");
            return 1;
        }
    }

    for (int i = 0; i < num_threads; i++)
    {
        if (pthread_join(threads[i], NULL))
        {
            fprintf(stderr, "Error joining thread\n");
            return 2;
        }
    }

    const cudaError_t reset_status = cudaDeviceReset();
    if (reset_status != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(reset_status));
        return 3;
    }
    return 0;
}
The Nsight Systems report I captured is shown below:
So my questions are:
If cudaMalloc synchronizes the host with the device, then the end of the second cudaMalloc on the CUDA API (host) timeline must come after the end of the kernel's execution on the CUDA HW (device) timeline.
Does cudaMalloc perform an implicit cudaStreamSynchronize on the legacy default stream, an implicit cudaDeviceSynchronize across the whole device, or something else?
If I turn on the per-thread default stream compile option, does that mean cudaMalloc, cudaMemcpy, and cudaFree run on the per-thread default stream? Can cudaMalloc, cudaMemcpy, and cudaFree block the host and other streams? Does the legacy default stream still exist when the per-thread default stream compile option is on?
I would really appreciate a quick reply. Thanks!
