Hi guys, my test case for this to occur is finding y = x’*x where x is a column vector 56000 x 1 complex elements, x’ is conjugate transpose of x. The result is expected to be a real number, no imaginary part present (or =0). The test works fine for an example with small vector. However, with a large vector produced from matlab, matlab gave a correct result while cuda code only gave a correct real element but incorrect imaginary part (not only != 0 but also >>>0). For verification, I export this vector from matlab to 2 files, one for cuda code and one to read it back to matlab, then process on this same set of data. And the result is still the same as my description above. Can someone spot any error in my code or is this an error born from the difference between GPU’s and CPU’s way of working on double type variable. Below are C++ and matlab code for testing:

C++:

```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cublas_v2.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
bool ReadFile(char* MyFile, cuDoubleComplex *array){
FILE* fi = fopen(MyFile, "rt");
if (fi != NULL){
int count = 0;
double temp;
while (!feof(fi)){
fscanf(fi, "%lf", &temp);
array[count].x = temp;
fscanf(fi, "%lf", &temp);
array[count].y = temp;
++count;
}
fclose(fi);
return 0;
}
return 1;
}
int main()
{
cuDoubleComplex *h_x = new cuDoubleComplex[56000];
// Stop program if any read op error
if (ReadFile("myFile6.txt", h_x)){
printf("Read file fail!");
return 1;
}
cuDoubleComplex h_y;
cuDoubleComplex alpha;
alpha.x = 1; alpha.y = 0;
cuDoubleComplex beta;
beta.x = 0; beta.y = 0;
cuDoubleComplex *d_x;
cuDoubleComplex *d_y;
cudaError_t cudaStatus;
cublasStatus_t stat;
cublasHandle_t handle;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&d_x, 56000 * sizeof(cuDoubleComplex));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_y, sizeof(cuDoubleComplex));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// cuBLAS handler
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS initialization failed\n");
return EXIT_FAILURE;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(d_x, h_x, 56000 * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
stat = cublasZgemm_v2(handle, CUBLAS_OP_C, CUBLAS_OP_N,
1, 1, 56000,
&alpha, d_x, 56000,
d_x, 56000, &beta,
d_y, 1);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(&h_y, d_y, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
printf("y = x'*x = %f %fi\n",
h_y.x, h_y.y);
Error:
cudaFree(d_x);
cudaFree(d_y);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
```

Matlab:

```
fid = fopen ('myFile7.txt','r');
ii = 1;
while ~feof(fid)
x(ii, :) = str2num(fgets(fid));
ii = ii + 1;
end
fclose(fid);
y = x'*x;
```

Thanks so much for your support!

myFile7.txt (2.92 MB)

myFile6.txt (2.9 MB)