Hello. I work with CUDA not very long and have some problem.
I compute sum of two vectors in one block, but if I change elements count of vectors result becomes wrong.
My code:
[codebox]#include <stdio.h>
#include <memory.h>
#include <cuda_runtime_api.h>
#define CHECK_CUDA_ERROR(err){if (err != cudaSuccess) {printf(cudaGetErrorString(err));exit(-1);}}
#define MAX_X (100)
#define MAX_Y (100)
#define MAX_Z (10)
global void addVector(float* a, float* b, float* c)
{
int x = threadIdx.x;
int y = threadIdx.y;
int z = threadIdx.z;
int idx = y * MAX_X + x + z * MAX_X * MAX_Y + z;
c[idx] = a[idx] + b[idx] + 3.0f;
};
#define MAX_ELEMENTS (MAX_X * MAX_Y * MAX_Z)
int main()
{
int byteSize = MAX_ELEMENTS * sizeof(float);
float* vec = new float[MAX_ELEMENTS];
float* devVec1;
float* devVec2;
float* devVec3;
CHECK_CUDA_ERROR(cudaMalloc((void**)&devVec1, byteSize))
CHECK_CUDA_ERROR(cudaMalloc((void**)&devVec2, byteSize))
CHECK_CUDA_ERROR(cudaMalloc((void**)&devVec3, byteSize))
cudaMemset(devVec1, 0, byteSize);
cudaMemset(devVec2, 0, byteSize);
cudaEvent_t syncEvent;
cudaEventCreate(&syncEvent);
dim3 blocks = dim3();
dim3 threads = dim3(MAX_X, MAX_Y, MAX_Z);
addVector<<<blocks, threads>>>(devVec1, devVec2, devVec3);
cudaEventRecord(syncEvent, 0);
cudaEventSynchronize(syncEvent);
cudaMemcpy(vec, devVec3, byteSize, cudaMemcpyDeviceToHost);
cudaEventDestroy(syncEvent);
printf("First element: %f\n", vec[0]);
printf("Second element: %f\n", vec[MAX_ELEMENTS - 1]);
cudaFree(devVec1);
cudaFree(devVec2);
cudaFree(devVec3);
delete[] vec;
return 0;
}
[/codebox]
Block size on my GeForce 9600M GS is 51251264. How I can change MAX_X, MAX_Y and MAX_Z. For example:
if MAX_X = 100 MAX_Y = 100 and MAX_Z = 10 then all elements in result zero. Help if can.
P.S.: How I can use all threads in block for computing?