Hello,
I am parallelizing a self-written PSO Code. That’s a code fragment which
unfortunately does not work.
C_struct_Particle * C_struct_Swarm_optimize(C_struct_Swarm * s) {
C_struct_Swarm *gpu__s;
float *f = (float *)malloc(sizeof(f));
float *gpu__f;
static unsigned int gpuBytes = sizeof(C_struct_Particle *)
+ ((1024 * 1024) * sizeof(C_struct_Particle *));
static unsigned int gpuBytes_f = sizeof(*f);
CUDA_SAFE_CALL(cudaMalloc(((void * *) (&gpu__s)), gpuBytes));
printf("After cuda Malloc calculated sizeof = %ld\n", gpuBytes);
printf("After cuda Malloc calculated sizeof(s) = %ld\n", sizeof(s));
printf("After cuda Malloc calculated sizeof(*s) = %ld\n", sizeof(*s));
CUDA_SAFE_CALL(cudaMalloc(((void * *) (&gpu__f)), gpuBytes_f));
for (j = 0; j < 20; 20; j++) {
int err = cudaMemcpy(gpu__s, s, gpuBytes, cudaMemcpyHostToDevice);
printf("cudaMemcpy err code = %d\n", err);
err = cudaMemcpy(gpu__f, f, gpuBytes_f, cudaMemcpyHostToDevice);
printf("cudaMemcpy for FLOAT +++ err code = %d\n", err);
C_struct_Swarm_optimize_kernel0<<<dimGrid0, dimBlock0, 0, 0>>>();
}
return 0;
}
In the first and second(!) iteration both cuda Memcopies work with error code 0. In the next 18 iterations they return error code 4. If I comment out the kernel call "C_struct_Swarm_optimize_kernel0<<<dimGrid0, dimBlock0, 0, 0>>>();", the cuda memcopies work in every(!) iteration for both variables.
I know that this code is incomplete, but I just don’t understand why the errors appear in cudaMemcpy.