Hi there guys, I have just started working with CUDA.
Something in my kernal is going awry, simply put I am multiplying a 2d matrix by itself on the device and then outputting the result back on the host.
The problem is that only some values get outputted, any help would be very much apreciated.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
__global__ void square_array(int *a, int pit)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
int idx = iy * pit + ix;
if (ix < pit && iy < pit) {
a[idx] = a[idx] * a[idx];
}
}
int main(void)
{
const int N = 10; // Number of elements in arrays
int *a_d; // Pointer to device array
int *p, p_h[N][N];
size_t pitch = 10;
dim3 block(N,N);
size_t size = N * sizeof(int);
int (*arrayString)[N];
int (*arrayString2)[N];
arrayString2 = (int(*)[N]) malloc(N * sizeof *arrayString); // Allocate array on host
arrayString = (int(*)[N]) malloc(N * sizeof *arrayString);
for (int i = 0; i< N; i++)
{
for (int j = 0; j< N; j++)
{
arrayString[j][i] = (i*10) + j;
printf("test1: %d\n", arrayString[j][i]);
}
}
cudaMallocPitch((void**) &p, &pitch, N*sizeof(int), N); // Allocate array on device
cudaMemcpy2D(p, pitch, arrayString, N*sizeof(int), N*sizeof(int),N ,cudaMemcpyHostToDevice);
square_array <<< 1, block >>> (p, pitch);
cudaMemcpy2D(arrayString2,N*sizeof(int),p,pitch,N*sizeof(int),N,cudaMemcpyDeviceToHost);
for (int i = 0; i< N; i++)
{
for (int j = 0; j< N; j++)
{
printf("test2: %d\n", arrayString2[j][i]);
}
}
return 0;
}
Thanks
Paul