Dear all,
please take a look at my program:
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 4096
#define P 4
/*
 * myKernel - repeated two-way merge of adjacent 2-element runs.
 *
 * Parameters:
 *   a      device array that is only read by this kernel.
 *   temp   device array that receives the merged output.
 *   maxId  controls the number of passes: the loop runs for
 *          i = maxId/2 down to 1, and in pass i only threads whose
 *          global id is below i do any work.
 *
 * In each pass an active thread merges a[index1 .. index1+1] with
 * a[index1+2 .. index1+3] into temp[index1 .. index1+3], where
 * index1 = rec_num * threadId.
 *
 * NOTE(review): every pass reads its input from `a` and writes to `temp`;
 * nothing in this kernel copies `temp` back into `a` (or swaps the two
 * pointers) between passes, so a later pass never sees the output of an
 * earlier one. If the passes are meant to build on each other, that
 * hand-over step still has to be added - confirm the intended algorithm.
 *
 * NOTE(review): __syncthreads() is a barrier for the threads of one block
 * only; it does not order work across different blocks of the grid.
 */
__global__ void myKernel(int *a, int *temp, int maxId)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    int v1 = maxId;

    for (int i = v1 / 2; i > 0; i--) {
        if (0 <= threadId && threadId < i) {
            // NOTE(review): `^` is bitwise XOR in C/C++, not exponentiation.
            // If a power of two was intended, the expression would be
            // (1 << (v1 - i)); that would however make rec_num * threadId far
            // larger than the N-element buffers allocated in main(), so the
            // stride formula needs to be revisited before changing it.
            // The original expression is kept unchanged here.
            int rec_num = (2 ^ (v1 - i)) * 2048;

            int index1 = rec_num * threadId;
            int endIndex1 = index1 + 1;     // first run:  [index1, endIndex1]
            int index2 = endIndex1 + 1;
            int endIndex2 = index2 + 1;     // second run: [index2, endIndex2]
            int targetIndex = index1;

            // Merge while both runs still have elements.
            while (index1 <= endIndex1 && index2 <= endIndex2) {
                if (a[index1] <= a[index2]) {
                    temp[targetIndex++] = a[index1++];
                } else {
                    temp[targetIndex++] = a[index2++];
                }
            }

            // Flush whatever is left of the run that was not exhausted;
            // without this the tail of temp[] keeps stale values.
            while (index1 <= endIndex1) {
                temp[targetIndex++] = a[index1++];
            }
            while (index2 <= endIndex2) {
                temp[targetIndex++] = a[index2++];
            }
        }
        // Kept outside the divergent branch so every thread of the block
        // reaches the barrier in every pass.
        __syncthreads();
    }
}
/* Abort with a readable message if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills an N-element host array with random values in [0, 99],
 * copies it to the device, launches myKernel on P + 1 blocks of 1024 threads
 * with maxId = 4, and copies the device `temp` buffer back into the host
 * array. All buffers are released before returning.
 */
int main()
{
    const size_t bytes = N * sizeof(int);
    int *a = NULL;
    int *dev_a = NULL;
    int *dev_temp = NULL;

    a = (int *)malloc(bytes);
    if (a == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void **)&dev_temp, bytes), "cudaMalloc(dev_temp)");

    srand((unsigned int)time(NULL));
    for (int i = 0; i < N; i++) {
        a[i] = rand() % 100;
    }

    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host -> dev_a)");

    myKernel<<<P + 1, 1024>>>(dev_a, dev_temp, 4);
    // A kernel launch returns nothing; configuration errors must be polled.
    checkCuda(cudaGetLastError(), "myKernel launch");

    // The blocking copy also surfaces any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(a, dev_temp, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_temp -> host)");

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_temp), "cudaFree(dev_temp)");
    free(a);
    return 0;
}
I allocated the memory (arrays) on the device inside main and passed the pointers to the kernel in the kernel call. My program runs, but it does not produce the expected result.
When the kernel does not use the ‘for loop’, it works well,
but with the ‘for loop’ it gives a wrong result.
From my experiments with this program, I found that each iteration of the ‘for loop’ must start with an empty ‘temp’ array, as if it were being passed through the kernel call for the first time.
How can I clear (free) ‘temp’ at the end of each ‘for loop’ iteration, before the next iteration starts?