Hi all,
I’m still rather new to cuda and I’m struggling with an issue I can’ seem to debug.
I’m working on a Windows system with a Quadro P620 GPU.
For some reason I cannot seem to get the profilers working properly like I used to have on a Linux system, so I’m flying half blind.
I hope someby can give me a push in the right direction for this.
Below is some sample code for my problem.
This code will generate two vectors, one float* and one complex*.
A function will copy a part of the floats to the real part of the complex
In the second part some small vectors are made and send to the GPU. Without altering them they are copied back and printed out. The expectation is to get two times the same value being printed out.
For N = 16 I correctly get:
5
5
For N =32 I get:
5
-4.31602e+08
include “cuda_runtime.h”
include “device_launch_parameters.h”
include <stdio.h>
include “iostream”typedef float2 Complex;
global void copyImageToReal(float*, Complex*);int main()
{
int N = 32;
int height = 2160; // define height
int width = 4096; // define widthComplex* d_target;
cudaMalloc((void**)&d_target, sizeof(Complex)* height* width * N); // assign device memory
cudaMemset((void**)&d_target, 0, sizeof(Complex)* height* width * N); // assign 0 to all elementsfloat* d_source;
cudaMalloc((void**)&d_source, sizeof(float) * height * width); // assign device memory
cudaMemset((void**)&d_source, 1, sizeof(float)* height* width); // assign 1 to all elements//copyImageToReal <<< 8640, 1024 >>>(d_source, d_target); // put source data to real part of the target // Target needs to be larger than source
cudaDeviceSynchronize();
// Some random code that has two numbers going to the GPU and back
float* d_Polar;
float* h_Polar = (float*)malloc(sizeof(float) * 2);
float* h_target = (float*)malloc(sizeof(float) * 2);h_target[0] = 5;
h_target[1] = 7;std::cout << h_target[0] << std::endl;
cudaMalloc((void**)&d_Polar, sizeof(float) * 2); // assign device memory
cudaMemcpy(d_Polar, h_target, sizeof(float) * 2, cudaMemcpyHostToDevice); // copy data
cudaMemcpy(h_Polar, d_Polar, sizeof(float) * 2, cudaMemcpyDeviceToHost); // copy data
cudaDeviceSynchronize();std::cout << h_Polar[0] << std::endl;
cudaFree(d_target);
cudaFree(d_source);
cudaFree(d_Polar);
cudaDeviceReset();
return 0;
}global void copyImageToReal(float* src, Complex* dst) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
dst[idx].x = src[idx];
};