My problem is posted in this link http://stackoverflow.com/questions/28090950/cuda-passing-user-defined-structure-to-a-kernel-failed.
My architecture is: “GeForce 210”, CUDA 5.0, with a cuda capabiliy Major/Minor version 1.2
Here is my compiling code below, in a “kernel.cu” file:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuPrintf.cu"
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
typedef struct
{
float a, b;
} point;
struct __Q_VECTOR__{
double* Data;
int Dimension;
int Cluster;
};
typedef struct __Q_VECTOR__ VQ_VECTOR;
__global__ void testKernel(VQ_VECTOR *X, int N){
int i= blockIdx.x*blockDim.x + threadIdx.x;
cuPrintf("\n testKernel entrance by the global threadIdx= %d\n", i);
for(int k=0; k<X[i].Dimension; k++)
cuPrintf("%.3f, ",X[i].Data[k]);
cuPrintf("\n");
}
int main(void){
int L = 3, //.Data length
N = 100;
VQ_VECTOR *A,
*device_VQ_VECTOR;
cudaError_t cudaStatus;
A = (VQ_VECTOR*)malloc(N*sizeof(VQ_VECTOR));
for(int i=0; i<N; i++){
VQ_VECTOR a;
a.Data = (double*)malloc(L*sizeof(double));;
a.Cluster = 1;
a.Dimension = L;
for(int j=0; j<L; j++)
a.Data[j]=(1+i)*(1+j);
A[i] = a;
}
//Prinf of all the elements of A
for(int i=0; i<2; i++){
printf("\nA[%d]={", i);
for(int j=0; j<L; j++)
printf("%.3f",A[i].Data[j]);
printf("}\n");
}
printf("\n\n");
//I Allocate and Copy data from A to device_VQ_VECTORon the GPU memory
cudaDeviceReset();
cudaStatus = cudaMalloc((void**)&device_VQ_VECTOR, N*sizeof(VQ_VECTOR));
cudaStatus = cudaMemcpy(device_VQ_VECTOR, A, N*sizeof(VQ_VECTOR), cudaMemcpyHostToDevice);
for(int i = 0; i != N; ++i) {
/* can't access device_VQ_VECTOR[i].Data directly from host-side,
* working around it with proxy variable */
double *out;
cudaMalloc(&out, L*sizeof(double));
cudaMemcpy(out, A[i].Data, L*sizeof(double),
cudaMemcpyHostToDevice);
cudaMemcpy(&device_VQ_VECTOR[i].Data, &out, sizeof(void*),
cudaMemcpyHostToDevice);
// will re-allocate later, for simplicity sake
free(A[i].Data);
}
cudaPrintfInit();
testKernel<<<N,1>>>(device_VQ_VECTOR, N);//to test and see on a sigle thread
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
return 1;
}
cudaStatus = cudaMemcpy(A, device_VQ_VECTOR, N*sizeof(VQ_VECTOR), cudaMemcpyDeviceToHost);
for(int i = 0; i != N; ++i) {
// allocate array, copy data
double *array = (double*)malloc(L*sizeof(double));
cudaMemcpy(array, A[i].Data, L*sizeof(double),
cudaMemcpyDeviceToHost);
// assign new array to A[i]
A[i].Data = array;
}
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "\n testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
return 1;
}
for(int i=0; i<2; i++){
printf("\nA[%d]={", i);
for(int j=0; j<L; j++)
printf("%.3f, ",A[i].Data[j]);
printf("}\n");
}
cudaFree(device_VQ_VECTOR);
// don't forget to free A and all its Data
return 0;
}
The output are not what I expected, beacause I have results like (it is huge, I don’t want to post too much too):
[1, 0]: 0.000, [1, 0]: 0.000, [1, 0]: 0.000, [1, 0]:
[2, 0]:
testKernel entrance by the global threadIdx= 2
[2, 0]: 0.000, [2, 0]: 0.000, [2, 0]: 0.000, [2, 0]:
[3, 0]:
testKernel entrance by the global threadIdx= 3
[3, 0]: 0.000, [3, 0]: 0.000, [3, 0]: 0.000, [3, 0]:
[4, 0]:
testKernel entrance by the global threadIdx= 4
[4, 0]: 0.000, [4, 0]: 0.000, [4, 0]: 0.000, [4, 0]:
[5, 0]:
testKernel entrance by the global threadIdx= 5
[5, 0]: 0.000, [5, 0]: 0.000, [5, 0]: 0.000, [5, 0]:
[6, 0]:
I don’t know what’s going wrong, because others don’t have these bad results with only zeroes. Is it a problem with my device architecture? What can I do to solve this problem?