Hi, I tried to do some Tests with shared Memory. This simple Code should give me a Array in the convers Order, but it gives me 9 1 2 3 4 5 6 7 8 9, not 9 8 7 6 5 4 3 2 1 0.
Where is the Problem? Is the Indexing in the global function wrong?
Thanks
[codebox]#include <stdio.h>
void checkCUDAError() {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("Cuda error: %s.\n",cudaGetErrorString( err) );
exit(false);
}
}
global void reverseArrayBlock(int *devArray) {
extern __shared__ int sData[];
// Copy Data in converse Order into shared Memory
sData[blockDim.x - 1 - threadIdx.x] = devArray[threadIdx.x];
// Synchronize Threads
__syncthreads();
//Copy Data from Shared to global Memory
devArray[threadIdx.x] = sData[threadIdx.x];
}
int main( int argc, char** argv)
{
int arraySize = 10;
int numThreadsPerBlock = 10;
int numBlocks = 1;
// Fill Array with 0,1,2,3,4,5,6,7,8,9
int hostArray[arraySize];
for (int i = 0; i < arraySize; i++){
hostArray[i] = i;
printf("%i ", hostArray[i]);
}
printf("\n");
// Allocate Memory on Device
int *devArray;
cudaMalloc((void**) &devArray, sizeof(hostArray));
checkCUDAError();
// Copy array to Device
cudaMemcpy(devArray, hostArray, sizeof(hostArray), cudaMemcpyHostToDevice);
checkCUDAError();
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
reverseArrayBlock<<< dimGrid, dimBlock, sizeof(devArray)>>>(devArray);
checkCUDAError();
// Synchronize Threads
cudaThreadSynchronize();
checkCUDAError();
// Copy Result into hostArray
cudaMemcpy(hostArray, devArray, sizeof(devArray), cudaMemcpyDeviceToHost);
checkCUDAError();
// Print result
for (int i = 0; i < arraySize; i++){
printf("%i ", hostArray[i]);
}
printf("\n");
// Free Memory
cudaFree(devArray);
printf("End!\n"); return 0;
}[/codebox]