I have the following code:
// Timing block: measures per-GPU buffer allocation and host->device upload for
// two devices using host-side clock().
// NOTE(review): clock() measures CPU time, not wall-clock time; prefer
// gettimeofday()/cudaEvent timing for device work. Also, no CUDA return codes
// are checked anywhere below — a failed cudaMalloc or cudaMemcpy would go
// unnoticed. Wrap each call in an error-checking macro.
time_start_block = clock();
// NOTE(review): the first CUDA runtime call on a device triggers one-time
// driver/context initialization. With GPU persistence mode off (common on
// older multi-GPU Tesla S1070 hosts), that initialization can take tens of
// seconds per device — the measured ~45 s is presumably dominated by context
// creation here and at the cudaSetDevice(1) below, not by the allocations or
// copies themselves. Confirm by timing a warm-up cudaFree(0) on each device
// before this block.
cudaSetDevice(0);
// Device 0 buffers: an N x N matrix plus two length-N vectors
// (N assumed to be defined earlier in the file — TODO confirm).
float *matrix_device_0;
cudaMalloc((void **) &matrix_device_0, N * N * sizeof(float) );
float *phi_device_0;
cudaMalloc((void **) &phi_device_0, N * sizeof(float) );
float *vector_TEMP_device_0;
cudaMalloc((void **) &vector_TEMP_device_0, N * sizeof(float) );
// Synchronous upload of matrix_A to device 0 (blocks until the copy completes).
cudaMemcpy(matrix_device_0, matrix_A, N * N * sizeof(float), cudaMemcpyHostToDevice);
// Switch to device 1; same allocation pattern, but this matrix receives
// matrix_B. First runtime call on device 1 pays its context-creation cost too.
cudaSetDevice(1);
float *matrix_device_1;
cudaMalloc((void **) &matrix_device_1, N * N * sizeof(float) );
float *phi_device_1;
cudaMalloc((void **) &phi_device_1, N * sizeof(float) );
float *vector_TEMP_device_1;
cudaMalloc((void **) &vector_TEMP_device_1, N * sizeof(float) );
cudaMemcpy(matrix_device_1, matrix_B, N * N * sizeof(float), cudaMemcpyHostToDevice);
time_end_block = clock();
// Elapsed CPU seconds for the whole setup. The cudaMemcpy calls are
// synchronous, so they are fully included in the interval; any context
// initialization triggered above is included as well.
time_dif_block = (float)(time_end_block-time_start_block) / CLOCKS_PER_SEC;
The measured execution time is approximately 45 seconds (Tesla S1070).
Why does it take so long?