I tried:
- using shared memory assigned through the 3rd kernel launch template argument
-
align(128)
Neither of these gives the expected number of kilobytes transferred…
Printing the starting offset of a shared variable within the block’s shared memory space (as you suggested) gives $00000000, so it seems aligned. I also tried casting the memory location to a unsigned long long (before I received your suggestion) and that gives 16777216, which is divisible by 128.
Could this be a device-specific issue? If so, I could get another device, but which one? (I wouldn’t like to spend much more than USD 100, though).
Code:
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
__device__ unsigned int __forceinline__ shared_ptr_32bit(void *global)
{
unsigned long long sharedptr;
asm(" cvta.to.shared.u64 %0,%1;\n\t"
: "=l"(sharedptr) : "l" (global));
return (unsigned int)sharedptr;
}
__global__ void
dot_product(float * x_g, float * y_g, int x_len, int y_len, float* dp_out, unsigned long long int* xy_ptr) {
//Option 1:
//__shared__ __align__(128) float x[256];
//__shared__ __align__(128) float y[256];
//Option 2:
extern __shared__ float xy[];
float* x = &xy[0];
float* y = &xy[256];
for (int t = threadIdx.x; t < 256; t += blockDim.x) {
x[t] = x_g[t];
y[t] = y_g[t];
}
__syncthreads();
//Dot product
float dp = 0;
for (int t = threadIdx.x; t < 256; t += blockDim.x) {
dp += x[t]*y[t];
}
dp_out[blockDim.x*blockIdx.x + threadIdx.x] = dp;
printf("&xy[0] = $%08x\n", shared_ptr_32bit(&xy[0]));
if (threadIdx.x == 0)
*xy_ptr = (unsigned long long int)xy;
__syncthreads();
}
/**
* Host main routine
*/
int
main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
int threadsPerBlock = 256;
int blocksPerGrid = 25;
size_t x_size = sizeof(float)*256;
size_t y_size = sizeof(float)*256;
size_t dp_size = sizeof(float)*threadsPerBlock*blocksPerGrid;
size_t xy_ptr_size = sizeof(unsigned long long int);
// Allocate the host input vectors
float* h_x = (float*)malloc(x_size);
float* h_y = (float*)malloc(y_size);
float* h_dp = (float*)malloc(dp_size);
unsigned long long int* h_xy_ptr = (unsigned long long int*)malloc(xy_ptr_size);
// Verify that allocations succeeded
if (h_x == NULL || h_y == NULL || h_dp == NULL)
{
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int t = 0; t < 256; t++) {
h_x[t] = (float)t / 256;
}
for (int t = 0; t < 256; t++) {
h_y[t] = (float)t / 256;
}
// Allocate the device input vectors
float* d_x = NULL;
err = cudaMalloc((void **)&d_x, x_size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector d_x (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
float* d_y = NULL;
err = cudaMalloc((void **)&d_y, y_size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector d_y (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
float* d_dp = NULL;
err = cudaMalloc((void **)&d_dp, dp_size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector d_dp (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
unsigned long long int* d_xy_ptr = NULL;
err = cudaMalloc((void **)&d_xy_ptr, xy_ptr_size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector d_xy_ptr (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the host input data from host memory to the device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_x, h_x, x_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector h_x from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_y, h_y, y_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector h_y from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the CUDA Kernel
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
//Option 1:
//dot_product << <blocksPerGrid, threadsPerBlock >> > (d_x, d_y, 256, 256, d_dp);
//Option 2:
size_t sharedMemorySize = x_size + y_size;
dot_product << <blocksPerGrid, threadsPerBlock, sharedMemorySize>> > (d_x, d_y, 256, 256, d_dp, d_xy_ptr);
cudaDeviceSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_dp, d_dp, dp_size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector d_dp from device to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(h_xy_ptr, d_xy_ptr, xy_ptr_size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector d_xy_ptr from device to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Verify that the result vector is correct
for (int i = 0; i < 256; ++i)
{
fprintf(stderr, "dp[%d]=%f\n", i, h_dp[i]);
}
fprintf(stderr, "xy_ptr=%llu\n", *h_xy_ptr);
// Free device global memory
err = cudaFree(d_x);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector d_x (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_y);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector d_y (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_dp);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector d_dp (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_xy_ptr);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector d_xy_ptr (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
free(h_x);
free(h_y);
free(h_dp);
free(h_xy_ptr);
printf("Done");
return 0;
}