As mentioned before:
here’s the complete program that does a sort on 4096 values.
[codebox]
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <assert.h>
// includes, project
#include <cutil.h>
#include <cufft.h>
//functions
double frand (void)
{
double value;
value = ((double) rand()/(RAND_MAX));
return value;
}
//kernels
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
global void fft_indx(int *fftind, int n, int inc)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i<inc)
{
fftind[i] = i + inc;
}
else
{
fftind[i] = i - inc;
}
}
global void find_corrmaxz(double *zmax, double *f4, int *shifted_x, int *shifted_y, int *reg_x, int *reg_y, int Nx, int Ny, int *fftindx, int *fftindy, int *edgel_x, int *edger_x, int *edgel_y, int *edger_y, int num)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
zmax[num] = f4[(num*Nx*Ny)+num];
if(i < Nx && j < Ny )
{
if(f4[(num*Nx*Ny)+i*Nx+j] >= zmax[num])
{
zmax[num] = f4[(num*Nx*Ny)+i*Nx+j];
reg_x[num] = j;
reg_y[num] = i;
shifted_x[num] = fftindy[j];
shifted_y[num] = fftindx[i];
}
if(reg_x[num] == 0)
{
edgel_x[num] = Nx - 1;
edger_x[num] = reg_x[num] + 1;
}
else if (reg_x[num] == Nx-1)
{
edgel_x[num] = reg_x[num] - 1;
edger_x[num] = 0;
}
else
{
edgel_x[num] = reg_x[num] - 1;
edger_x[num] = reg_x[num] + 1;
}
if(reg_y[num] == 0)
{
edgel_y[num] = Ny - 1;
edger_y[num] = reg_y[num] + 1;
}
else if (reg_y[num] == Ny-1)
{
edgel_y[num] = reg_y[num] - 1;
edger_y[num] = 0;
}
else
{
edgel_y[num] = reg_y[num] - 1;
edger_y[num] = reg_y[num] + 1;
}
}
// printf("edgel_x[%d] = %d, edger_x[%d] = %d, edgel_y[%d] = %d, edger_y[%d] = %d\n", num, edgel_x[num], num, edger_x[num], num, edgel_y[num], num, edger_y[num]);
}
/************************* Main ************************/
int main()
{
// check the compute capability of the device
int cuda_device = 0;
int num_devices=0;
float elapsed_time;
unsigned int timer;
cutCreateTimer(&timer);
CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
if(0==num_devices)
{
printf("your system does not have a CUDA capable device\n");
return 1;
}
// check if the command-line chosen device ID is within range, exit if not
if( cuda_device >= num_devices )
{
printf("choose device ID between 0 and %d\n", num_devices-1);
return 1;
}
cudaSetDevice( cuda_device );
cudaDeviceProp device_properties;
CUDA_SAFE_CALL( cudaGetDeviceProperties(&device_properties, cuda_device) );
if( (1 == device_properties.major) && (device_properties.minor < 1))
{
printf("%s does not have compute capability 1.1 or later\n\n", device_properties.name);
}
printf("running on: %s\n\n", device_properties.name );
// create CUDA event handles
cudaEvent_t start_event, stop_event;
cudaEventCreate(&start_event);
cudaEventCreate(&stop_event);
cudaEventRecord(start_event, 0);
int Nx = 64;
int Ny = 64;
int sizex = 72;
int sizey = 43;
int sizez = 3096;
unsigned int seed = 123456789;
int i;
int incx = Nx >> 1;
int incy = Ny >> 1;
double *f1, *f1_d;
cudaMallocHost((void**) &f1, sizeof(double) * Nx * Ny * sizez);
checkCUDAError("cudaMallocHost calls");
cudaMalloc((void**) &f1_d, sizeof(double) * Nx * Ny * sizez);
checkCUDAError("cudaMalloc calls");
// srand(seed);
for(i = 0; i < Nx * Ny * sizez; ++i)
{
f1[i] = frand();
// printf("f[%d] = %lf\n",i, f1[i] );
}
cudaMemcpy(f1_d, f1, sizeof(double) * Nx * Ny * sizez, cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy calls");
/* Initialize fftindx and fftindy */
int *fftindx_d, *fftindy_d;
cudaMalloc((void**) &fftindx_d, sizeof(int) * Nx);
cudaMalloc((void**) &fftindy_d, sizeof(int) * Ny);
// Check for any CUDA errors
checkCUDAError("cudaMalloc calls");
dim3 block3(32,16);
dim3 grid3(2,4);
fft_indx <<< grid3, block3 >>> (fftindx_d, Nx, incx);
fft_indx <<< grid3, block3 >>> (fftindy_d, Ny, incy);
double *zmax;
double *zmax_h;
cudaMalloc((void**) &zmax, sizeof(double) * sizez);
cudaMallocHost((void**) &zmax_h, sizeof(double) * sizez);
cudaMemset(zmax, 0, sizeof(double) * sizez);
int *s_x, *s_y, *r_x, *r_y;
cudaMalloc((void**) &s_x, sizeof(int) * sizez);
cudaMalloc((void**) &s_y, sizeof(int) * sizez);
cudaMalloc((void**) &r_x, sizeof(int) * sizez);
cudaMalloc((void**) &r_y, sizeof(int) * sizez);
cudaMemset(s_x, 0, sizeof(int) * sizez);
cudaMemset(s_y, 0, sizeof(int) * sizez);
cudaMemset(r_x, 0, sizeof(int) * sizez);
cudaMemset(r_y, 0, sizeof(int) * sizez);
int *edgel_x, *edger_x, *edgel_y, *edger_y;
cudaMalloc((void**) &edgel_x, sizeof(int) * sizez);
cudaMalloc((void**) &edgel_y, sizeof(int) * sizez);
cudaMalloc((void**) &edger_x, sizeof(int) * sizez);
cudaMalloc((void**) &edger_y, sizeof(int) * sizez);
cudaMemset(edgel_x, 0, sizeof(int) * sizez);
cudaMemset(edgel_y, 0, sizeof(int) * sizez);
cudaMemset(edger_x, 0, sizeof(int) * sizez);
cudaMemset(edger_y, 0, sizeof(int) * sizez);
int *s_xh, *s_yh, *r_xh, *r_yh;
cudaMallocHost((void**) &s_xh, sizeof(int) * sizez);
cudaMallocHost((void**) &s_yh, sizeof(int) * sizez);
cudaMallocHost((void**) &r_xh, sizeof(int) * sizez);
cudaMallocHost((void**) &r_yh, sizeof(int) * sizez);
int *edgel_xh, *edger_xh, *edgel_yh, *edger_yh;
cudaMallocHost((void**) &edgel_xh, sizeof(int) * sizez);
cudaMallocHost((void**) &edgel_yh, sizeof(int) * sizez);
cudaMallocHost((void**) &edger_xh, sizeof(int) * sizez);
cudaMallocHost((void**) &edger_yh, sizeof(int) * sizez);
for(int num = 0; num < sizey; ++num)
{
find_corrmaxz <<< grid3, block3 >>> (zmax, f1_d, s_x, s_y, r_x, r_y, Nx, Ny, fftindx_d, fftindy_d, edgel_x, edger_x, edgel_y, edger_y, num);
cudaThreadSynchronize();
}
cudaMemcpy(zmax_h, zmax, sizeof(double) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(s_xh, s_x, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(s_yh, s_y, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(r_xh, r_x, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(r_yh, r_y, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(edgel_xh, edgel_x, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(edgel_yh, edgel_y, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(edger_xh, edger_x, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
cudaMemcpy(edger_yh, edger_y, sizeof(int) * sizez, cudaMemcpyDeviceToHost);
for(i = 0; i < sizey; ++i)
{
printf("zmax_h[%d] = %lf, shifted_x = %d, shifted_y = %d... reg_x = %d, reg_y = %d\n", i, zmax_h[i], s_xh[i], s_yh[i], r_xh[i], r_yh[i]);
printf("edgel_x[%d] = %d, edger_x[%d] = %d, edgel_y[%d] = %d, edger_y[%d] = %d\n", i, edgel_xh[i], i, edger_xh[i], i, edgel_yh[i], i, edger_yh[i]);
}
cudaFreeHost(s_xh);
cudaFreeHost(s_yh);
cudaFreeHost(r_xh);
cudaFreeHost(r_yh);
cudaFreeHost(zmax_h);
cudaFreeHost(edgel_xh);
cudaFreeHost(edgel_yh);
cudaFreeHost(edger_xh);
cudaFreeHost(edger_yh);
checkCUDAError("cudaFreeHost calls");
// Check elapsed time
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(f1);
cudaFree(f1_d);
cudaFree(fftindx_d);
cudaFree(fftindy_d);
cudaFree(s_x);
cudaFree(s_y);
cudaFree(r_x);
cudaFree(r_y);
cudaFree(zmax);
cudaFree(edgel_x);
cudaFree(edgel_y);
cudaFree(edger_x);
cudaFree(edger_y);
checkCUDAError("cudaFree calls");
printf("\n\n Elapsed time on GPU = %.5f ms\n", elapsed_time);
return 0;
}
[/codebox]
So the kernel computes the maximum value among 4096 values and then records its position.
Doubts:
-
Is this the correct way to distribute data sort within a kernel?
-
I need to run the kernel 3096 times. But I am sticking to 43 times, because my data printed out from the kernel is incorrect after 43.(page locked memory limit??)
-
I haven’t used cudathreadsynchronize as that is important only for timing the kernel.