Dear my helper:
I want to calculate percentile of ~1000 columns together, each columns with ~25000 data points.
First I use sorting method to get my result.
But I now have 2 problems,
(1) Some errors happened when I use DATA size of 2560 x 1024, but okay for small size. Pls help to modify it. (for line 16 //for actually large data test)
(2) The sorting method “shell sort” do not have good performance for GPU. Any better methods ?
Can you help to modify it ?
Thanks in advance.
My hardware is 9800 GT with 1G RAM, 14 multiprocessors.
iamjyj801
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
//for simple data check
#define DATA_SIZE 102 // 51250*1024 //data 25600 x 1024 (row x column)
#define BLOCK_NUM 2 //1024 //equal to column number
#define THREAD_NUM 10 //512 //each column with THREAD_NUM threrads
/*
//for test too
#define DATA_SIZE 1011024 //data 25600 x 1024 (row x column)
#define BLOCK_NUM 1024 //equal to column number
#define THREAD_NUM 10 //each column with THREAD_NUM threrads
//for actually large data test
#define DATA_SIZE 512 * 50 * 1024 //data 25600 x 1024 (row x column)
#define BLOCK_NUM 1024 //equal to column number
#define THREAD_NUM 512 //each column with THREAD_NUM threrads
*/
#define DATA_ROW DATA_SIZE/BLOCK_NUM //each column with DATAROW for sort
float data[DATA_SIZE]; //now use one dimension array, want to change to 2 array
int floatCompare(const void* a, const void* B); //for CPU sort
void shellsort2(float number, int MAX) ; //for CPU sort
global static void SortByGPU(float num, float result_set1, float * result_sum)
{
//extern
__shared__ float shared_set1[BLOCK_NUM];
//extern
__shared__ float shared_sum[BLOCK_NUM];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
shared_set1[tid] = 0;
shared_sum[tid] = 0;
float temp;
int index, index_addgap;
int i, j,k,gap, MAX;
MAX = DATA_ROW;
for (gap = DATA_ROW ; gap > 0; gap /= 2){ //use shell sort
//each thread calculate their part for(k = 0; k < gap; k++) { }
if (threadIdx.x <gap )
{
k = threadIdx.x;
for(i = k+gap; i < MAX; i+=gap) {
for(j = i - gap; j >= k; j-=gap) {
index = j * BLOCK_NUM + bid ; //Since I use one dimension array to simulate 2 dim
index_addgap= (j+gap) * BLOCK_NUM + bid ;
if(num[index] > num[index_addgap]) { //SWAP(num[index], num[index+gap]);
temp = num[index]; //x
num[index] = num[index_addgap]; //x = y
num[index_addgap] = temp ; //y = t
}
else break;
}
}
}
__syncthreads();
}
}
void GenerateNumbers(float *number, int size)
{
int mycolumns=BLOCK_NUM;
for(int k = 0; k < mycolumns; k++) {
for(int j = 0; j < (DATA_ROW); j++) {
//number[j*mycolumns+k] = (float) (rand() % 1000 +0.32 ) *1 ; //just for float
number[j*mycolumns+k] = (float) (DATA_ROW - j ) ; //for check result
}
}
for (int k=0; k< DATA_SIZE; k++)
{ printf("k=%d %f\n",k, number[k] ); if (k>= 21) break; //only print out 20 data
}
fprintf(stderr, "Data generated.\n");
getchar();
}
bool InitCUDA()
{
int count;
cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}
int i;
for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}
#define SWAP(x,y) {float t; t = x; x = y; y = t;}
int floatCompare(const void* a, const void* B)
{
union{
float result;
int retval;
}value;
value.result = ((float)a > (float)B);
return value.retval;
}
void shellsort2(float number, int MAX) {
int i, j, k, gap;
for (gap = MAX / 2; gap > 0; gap /= 2){
for(k = 0; k < gap; k++) {
for(i = k+gap; i < MAX; i+=gap) {
for(j = i - gap; j >= k; j-=gap) {
if(number[j] > number[j+gap]) {
SWAP(number[j], number[j+gap]);
}
else
break;
}
}
}
}
}
int main()
{
if(!InitCUDA()) {
return 0;
}
printf(“CUDA initialized.\n”);
GenerateNumbers(data, DATA_SIZE);
float result_check[DATA_SIZE];
unsigned int timer;
cutCreateTimer(&timer);
cutStartTimer(timer);
float* gpudata, *result_set1, *result_sum;
// clock_t* time;
cudaMalloc((void**) &gpudata, sizeof(float) * DATA_SIZE);
// cudaMalloc((void**) &result_set1, sizeof(float) * BLOCK_NUM);
cudaMalloc((void**) &result_set1, sizeof(float) * BLOCK_NUM); //special add sum after sum square
cudaMalloc((void**) &result_sum , sizeof(float) * BLOCK_NUM);
// cudaMalloc((void**) &time, sizeof(clock_t) * BLOCK_NUM * 2);
cudaMemcpy(gpudata, data, sizeof(float) * DATA_SIZE, cudaMemcpyHostToDevice);
//sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(float) * 2 >>>(gpudata, result_set1, result_sum );
SortByGPU<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(float) * 2 >>>(gpudata, result_set1, result_sum );
float set1[BLOCK_NUM], sum0[BLOCK_NUM];
//clock_t time_used[BLOCK_NUM * 2];
cudaMemcpy(&set1, result_set1, sizeof(float) * BLOCK_NUM , cudaMemcpyDeviceToHost);
cudaMemcpy(&sum0 , result_sum , sizeof(float) * BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&result_check , gpudata , sizeof(float) * DATA_SIZE, cudaMemcpyDeviceToHost); //copy back data
cudaFree(gpudata);
cudaFree(result_set1);
cudaFree(time);
cutStopTimer(timer);
float GPUTime = cutGetTimerValue(timer);
printf(“GPU time): %0.7f\n”, GPUTime);
for (int k=0; k< DATA_SIZE; k++)
{ printf("k=%d %f\n",k, result_check[k] ); if (k>= 21) break; //only print out 20 data
if (k> 21) break;
}
fprintf(stderr, "Data generated.\n");
// next for CPU
cutResetTimer(timer);
cutStartTimer(timer); //Timer begin
float tmp;
int mycolumns=BLOCK_NUM;
float be_sorted_data[DATA_ROW]={0};
for(int k = 0; k < mycolumns; k++) {
for(int j = 0; j < (DATA_ROW); j++) { //loop for each column
tmp=data[j*mycolumns+k]; //each data for column j
be_sorted_data[j]=tmp;
}
shellsort2( be_sorted_data, DATA_ROW);
}
cutStopTimer(timer); //Timer end
float CPUTime = cutGetTimerValue(timer);
printf("CPU time): %0.7f\n", CPUTime);
printf("GPU time): %0.7f\n", GPUTime);
getchar();
return 0;
}
[/codebox]
For_cuda_sort2.zip (63.9 KB)