“vectorAdd2” (0.74 ms) runs 13.5x faster than “vectorAdd1” (10 ms); this seems to be a GPGPU limitation.
However, someone else reports only a 30% difference (1.01 ms vs 1.28 ms), so please test this code!
#include <stdio.h>
#include <stdlib.h>
#include <helper_math.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
// File-scope 1D float4 texture references, read back as raw element values
// (cudaReadModeElementType). main() binds tex1, tex2 and tex3 to the device
// buffers of d_A, d_B and d_D through bindTex(); vectorAdd2 reads them with
// tex1Dfetch().
// NOTE(review): texture references are a legacy CUDA API (reportedly removed
// in CUDA 12) -- confirm against the target toolkit and consider texture objects.
texture<float4, 1, cudaReadModeElementType> tex1, tex2, tex3;
/**
 * CUDA Kernel Device code
 *
 * Gather-and-add benchmark kernel that uses plain global-memory loads.
 *
 * Every thread whose global index i is below numElements walks
 * j = 0 .. numElements-1 and stores A[Ai[j]] + B[Bi[j]] + D[Di[j]] into C[i].
 * Each iteration overwrites the same slot, so after the loop C[i] holds the
 * sum computed for j = numElements-1.
 *
 * Ai, Bi and Di are read at positions 0 .. numElements-1 and their values are
 * used as indices into A, B and D respectively; C is written at positions
 * 0 .. numElements-1. The global index is derived from blockIdx.x/threadIdx.x
 * only.
 */
__global__ void
vectorAdd1(const float4 *A, const float4 *B, const float4 *D, float4 *C, int * Ai, int * Bi, int * Di, int numElements)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads whose index lies beyond the requested range have nothing to do.
    if (tid >= numElements)
    {
        return;
    }

    for ( int k = 0; k < numElements; ++k )
    {
        // Indirect (gathered) loads through the three index arrays.
        const float4 a = A[ Ai[k] ];
        const float4 b = B[ Bi[k] ];
        const float4 d = D[ Di[k] ];
        C[tid] = a + b + d;
    }
}
/**
 * Texture-fetch variant of the benchmark kernel.
 *
 * Every thread whose global index i is below numElements walks
 * j = 0 .. numElements-1, fetches element j from the file-scope texture
 * references tex1, tex2 and tex3 with tex1Dfetch(), and stores the sum of the
 * three texels into C[i]. Each iteration overwrites the same slot, so after
 * the loop C[i] holds the sum computed for j = numElements-1.
 *
 * Unlike vectorAdd1, no index arrays are involved: texel j is fetched directly.
 * The global index is derived from blockIdx.x/threadIdx.x only.
 */
__global__ void
vectorAdd2(float4 *C, int numElements)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads whose index lies beyond the requested range have nothing to do.
    if (tid >= numElements)
    {
        return;
    }

    for ( int k = 0; k < numElements; ++k )
    {
        const float4 a = tex1Dfetch( tex1, k );
        const float4 b = tex1Dfetch( tex2, k );
        const float4 d = tex1Dfetch( tex3, k );
        C[tid] = a + b + d;
    }
}
/**
 * Owns a host buffer of random float4 values together with a device copy.
 *
 * The constructor allocates num_elements float4 values on the host, fills
 * every component with rand()/RAND_MAX, allocates a device buffer of the same
 * size and uploads the host data. Any allocation or copy failure is reported
 * on stderr and terminates the program. For num_elements <= 0 nothing is
 * allocated and both pointers stay NULL.
 *
 * The destructor releases both buffers. Copying is disabled because a copy
 * would share the raw pointers and free them twice.
 */
struct Data {
    float4 *d_A;    // device buffer (cudaMalloc), NULL when empty
    float4 *h_A;    // host buffer (malloc), NULL when empty

    Data( int num_elements ) : d_A( NULL ), h_A( NULL ) {
        if ( num_elements <= 0 )
        {
            return;
        }

        // size_t arithmetic: the byte count must not be squeezed into an int.
        const size_t size = (size_t)num_elements * sizeof ( float4 );

        h_A = (float4 *)malloc( size );
        if ( h_A == NULL )
        {
            fprintf(stderr, "Failed to allocate %lu bytes of host memory!\n", (unsigned long)size);
            exit(EXIT_FAILURE);
        }

        for (int i = 0; i < num_elements; ++i)
        {
            // Separate statements pin the order of the rand() calls; as
            // function arguments their evaluation order is unspecified.
            const float x = rand()/(float)RAND_MAX;
            const float y = rand()/(float)RAND_MAX;
            const float z = rand()/(float)RAND_MAX;
            const float w = rand()/(float)RAND_MAX;
            h_A[i] = make_float4( x, y, z, w );
        }

        check( cudaMalloc((void **)&d_A, size), "cudaMalloc" );
        check( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy" );
    }

    ~Data() {
        // Return codes are deliberately ignored: a destructor has no useful
        // way to report them. Both calls accept NULL.
        cudaFree(d_A);
        free(h_A);
    }

private:
    // Not copyable (declared, never defined).
    Data( const Data & );
    Data &operator=( const Data & );

    // Reports a failed CUDA runtime call on stderr and terminates.
    static void check( cudaError_t err, const char *what ) {
        if ( err != cudaSuccess )
        {
            fprintf(stderr, "%s failed (error code %s)!\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
} ;
/**
 * Owns a host buffer of random indices together with a device copy.
 *
 * The constructor allocates num_elements ints on the host, fills each with
 * rand() % num_elements (so every value lies in [0, num_elements)), allocates
 * a device buffer of the same size and uploads the host data. Any allocation
 * or copy failure is reported on stderr and terminates the program. For
 * num_elements <= 0 nothing is allocated and both pointers stay NULL.
 *
 * The destructor releases both buffers. Copying is disabled because a copy
 * would share the raw pointers and free them twice.
 */
struct Data_i {
    int *d_A;    // device buffer (cudaMalloc), NULL when empty
    int *h_A;    // host buffer (malloc), NULL when empty

    Data_i( int num_elements ) : d_A( NULL ), h_A( NULL ) {
        // Also keeps the modulo below away from a zero divisor.
        if ( num_elements <= 0 )
        {
            return;
        }

        // size_t arithmetic: the byte count must not be squeezed into an int.
        const size_t size = (size_t)num_elements * sizeof ( int );

        h_A = (int *)malloc( size );
        if ( h_A == NULL )
        {
            fprintf(stderr, "Failed to allocate %lu bytes of host memory!\n", (unsigned long)size);
            exit(EXIT_FAILURE);
        }

        for (int i = 0; i < num_elements; ++i)
        {
            h_A[i] = rand() % num_elements;
        }

        check( cudaMalloc((void **)&d_A, size), "cudaMalloc" );
        check( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy" );
    }

    ~Data_i() {
        // Return codes are deliberately ignored: a destructor has no useful
        // way to report them. Both calls accept NULL.
        cudaFree(d_A);
        free(h_A);
    }

private:
    // Not copyable (declared, never defined).
    Data_i( const Data_i & );
    Data_i &operator=( const Data_i & );

    // Reports a failed CUDA runtime call on stderr and terminates.
    static void check( cudaError_t err, const char *what ) {
        if ( err != cudaSuccess )
        {
            fprintf(stderr, "%s failed (error code %s)!\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
} ;
//float4 get_Random_Float4_CUDA_Ptr ( int num_elements );
// Forward declaration; the definition follows main(). Binds num_verts float4
// elements starting at the device pointer verts to the texture reference tex.
void bindTex( texture<float4, 1, cudaReadModeElementType> & tex, float4 * verts, int num_verts );
/**
 * Reports a failed CUDA runtime call on stderr and terminates the program.
 */
static void checkCudaCall(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s failed (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Terminates the program if the most recent kernel launch was rejected.
 */
static void checkKernelLaunch(void)
{
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Allocates four random float4 buffers and three random index buffers, binds
 * three of the float4 buffers to the textures, then times one launch of
 * vectorAdd1 (global-memory gather) and one launch of vectorAdd2 (texture
 * fetch) with CUDA events and prints both times in milliseconds.
 *
 * Returns 0 on success; any CUDA failure exits with EXIT_FAILURE.
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Number of float4 values / indices held by every buffer.
    const int numElements = 5000;
    // Both kernels are launched over numElements / 4 work items, exactly as
    // before, although every buffer holds numElements entries.
    // NOTE(review): the /4 looks like a leftover from a float-based version --
    // confirm the intended workload.
    const int workItems = numElements / 4;

    const int threadsPerBlock = 256;
    const int blocksPerGrid = (workItems + threadsPerBlock - 1) / threadsPerBlock;

    // Inner scope: the buffers must be destroyed (cudaFree) BEFORE
    // cudaDeviceReset() tears the device state down.
    {
        Data d_A( numElements ), d_B( numElements ), d_C( numElements ), d_D( numElements ) ;
        Data_i d_Ai(numElements), d_Bi(numElements), d_Di(numElements);

        bindTex ( tex1, d_A.d_A, numElements );
        bindTex ( tex2, d_B.d_A, numElements );
        bindTex ( tex3, d_D.d_A, numElements );

        cudaEvent_t start, stop;
        checkCudaCall(cudaEventCreate(&start), "cudaEventCreate(start)");
        checkCudaCall(cudaEventCreate(&stop), "cudaEventCreate(stop)");

        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

        // Untimed warm-up of both kernels so that one-time start-up costs of a
        // first launch are not attributed to whichever kernel is timed first.
        vectorAdd1<<<blocksPerGrid, threadsPerBlock>>>(d_A.d_A, d_B.d_A, d_D.d_A, d_C.d_A, d_Ai.d_A, d_Bi.d_A, d_Di.d_A, workItems);
        checkKernelLaunch();
        vectorAdd2<<<blocksPerGrid, threadsPerBlock>>>(d_C.d_A, workItems);
        checkKernelLaunch();
        checkCudaCall(cudaDeviceSynchronize(), "warm-up cudaDeviceSynchronize");

        float milliseconds = 0;

        // Timed run 1: gather through global memory.
        checkCudaCall(cudaEventRecord(start), "cudaEventRecord(start)");
        vectorAdd1<<<blocksPerGrid, threadsPerBlock>>>(d_A.d_A, d_B.d_A, d_D.d_A, d_C.d_A, d_Ai.d_A, d_Bi.d_A, d_Di.d_A, workItems);
        checkKernelLaunch();
        checkCudaCall(cudaEventRecord(stop), "cudaEventRecord(stop)");
        // Also surfaces faults raised while the kernel was executing.
        checkCudaCall(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        checkCudaCall(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
        printf ("\nTime for the kernel: %f ms\n", milliseconds );

        // Timed run 2: texture fetches.
        checkCudaCall(cudaEventRecord(start), "cudaEventRecord(start)");
        vectorAdd2<<<blocksPerGrid, threadsPerBlock>>>(d_C.d_A, workItems);
        checkKernelLaunch();
        checkCudaCall(cudaEventRecord(stop), "cudaEventRecord(stop)");
        checkCudaCall(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        milliseconds = 0;
        checkCudaCall(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
        printf ("\nTime for the kernel (tex1DFetch): %f ms\n", milliseconds );

        // Release what this scope created; the buffers follow at the brace.
        checkCudaCall(cudaEventDestroy(start), "cudaEventDestroy(start)");
        checkCudaCall(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
        checkCudaCall(cudaUnbindTexture(tex1), "cudaUnbindTexture(tex1)");
        checkCudaCall(cudaUnbindTexture(tex2), "cudaUnbindTexture(tex2)");
        checkCudaCall(cudaUnbindTexture(tex3), "cudaUnbindTexture(tex3)");
    }

    // Reset the device and exit
    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    getchar();    // keep the console window open until a key is pressed
    return 0;
}
/**
 * Binds a linear device buffer to a 1D float4 texture reference.
 *
 * tex       texture reference to configure and bind
 * verts     device pointer to the first float4 element
 * num_verts number of float4 elements to expose through the texture
 *
 * The texture is set up for unnormalized integer coordinates with point
 * sampling, which is what tex1Dfetch() uses. A NULL/empty buffer or a failed
 * bind is reported on stderr and terminates the program.
 */
void bindTex( texture<float4, 1, cudaReadModeElementType> & tex, float4 * verts, int num_verts )
{
    // Reject empty input up front; a negative count would otherwise wrap to a
    // huge size_t below.
    if ( verts == NULL || num_verts <= 0 )
    {
        fprintf(stderr, "bindTex: nothing to bind (verts=%p, num_verts=%d)!\n", (void *)verts, num_verts);
        exit(EXIT_FAILURE);
    }

    tex.normalized = false;                      // integer (unnormalized) texel coordinates
    tex.filterMode = cudaFilterModePoint;        // point sampling, no interpolation
    // Wrap addressing is only defined for normalized coordinates; clamp is a
    // mode that is valid for unnormalized fetches.
    tex.addressMode[0] = cudaAddressModeClamp;

    const size_t size = sizeof(float4) * (size_t)num_verts;
    const cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();

    const cudaError_t err = cudaBindTexture(0, tex, verts, channelDesc, size);
    if ( err != cudaSuccess )
    {
        fprintf(stderr, "cudaBindTexture failed (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}