I think your understanding is wrong. Personally, I would avoid dealing with CUDA contexts explicitly whenever possible. Simply call cudaSetDevice(deviceId) to select the active device for subsequent allocations, kernel launches, etc. You can switch GPUs whenever you like, even from within the same thread.
// Minimal multi-GPU example using only the CUDA runtime API.
//
// cudaSetDevice() selects the device that the following allocations, kernel
// launches and synchronization calls issued from this host thread apply to.
// No explicit context handling is needed.
//
// Every runtime call returns a cudaError_t. CHECK_CUDA leaves main() with a
// non-zero exit status on the first failure instead of silently continuing
// with an invalid device or pointer (e.g. when the machine has only one GPU).
// On that early-exit path outstanding allocations are not freed explicitly;
// the process is terminating anyway.
#define CHECK_CUDA(call)                       \
    do {                                       \
        if ((call) != cudaSuccess) return 1;   \
    } while (0)

int main(){
    // One single-int buffer per GPU; initialised so a failed cudaMalloc
    // never leaves an indeterminate pointer behind.
    int* d_array0 = 0;
    int* d_array1 = 0;

    CHECK_CUDA(cudaSetDevice(0));
    CHECK_CUDA(cudaMalloc(&d_array0, sizeof(int))); //allocate on gpu 0
    CHECK_CUDA(cudaSetDevice(1));
    CHECK_CUDA(cudaMalloc(&d_array1, sizeof(int))); //allocate on gpu 1

    CHECK_CUDA(cudaSetDevice(0));
    kernel<<<...>>>(d_array0); //kernel runs on gpu 0
    // A launch returns no status itself; configuration errors are only
    // visible through cudaGetLastError().
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaSetDevice(1));
    kernel<<<...>>>(d_array1); //kernel runs on gpu 1
    CHECK_CUDA(cudaGetLastError());

    // Errors raised while a kernel executes surface at the next
    // synchronizing call, so the synchronize results are checked as well.
    CHECK_CUDA(cudaSetDevice(0));
    CHECK_CUDA(cudaDeviceSynchronize()); //synchronize gpu 0
    CHECK_CUDA(cudaFree(d_array0));
    CHECK_CUDA(cudaSetDevice(1));
    CHECK_CUDA(cudaDeviceSynchronize()); //synchronize gpu 1
    CHECK_CUDA(cudaFree(d_array1));

    return 0;
}
#undef CHECK_CUDA