¿bug? __constant__ var missalign my __device__ var

Hi!

I had a problem in my program, when accessing images previously stored, so I’ve write a small one with just the basic things.

I have a device var, that is a float4 pointer.

I also have a function to store data into that var, and a function to load that data. I’m using a device var because I’m calling both function separately from C++, as a extern functions, so my intention is to declare it as a “global” memory, persistent from both calls.

What I’ve discovered is that, if I had some other data into the “global space”, as for example a constant var, or another device var, when I read the image it’s data is disordered, some of the last columns of pixels are been moved to the start of the image or vice versa.

All that is because I’m writing the image directly on the kernel, passing the direction as argument with " cudaGetSymbolAddress() " function. In my opinion that should be enought for writing the device var without problems.

I’ve realised that, if instead of that I declare a tipical device pointer for working inside the kernel and after that, I use cudaMemcpyToSymbol() to “update” the device var, I don’t have that problem, but I don’t want to make my application slower with that memcpy, if the other should work.

¿Anyone can explainme why happens the missalign of the image?

Here’s my code.

Don’t worry about the image float4 pitch, becuse I’ve realised that the index is exactly the same as for the unsigned char images (taking into account that float4_pitch/sizeof(float4) = uchar_pitch / sizeof(uchar) because the image width is the same for both cases).

I’m sure the issue is not here, because I’ve wrote a lot of code with that way of indexing, and it works fine. Also for that code, if we change the cudaGetSymbolAddress for the cudaMemcpyToSymbol as I explained before.

Thanks for your help.

-KaiK-

[codebox]

#include <cuda.h>

#include <stdio.h>

#include <imageplus/foreground2d/foreground2d_cuda.h>

#define THREADS_X 16

#define THREADS_Y 22

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

if( cudaSuccess != err) 

{

    printf("Cuda error: %s: %s.\n", msg, 

                         cudaGetErrorString( err) );

    exit(EXIT_FAILURE);

}                         

}

//global vars

_constant size_t PITCH; //or whatever

device float4* image;

////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////GLOBAL FUNCTIONS////////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////

//creates a float4 vector with the r, g, and b values of each pixel

global void set_image_kernel(unsigned char *r, unsigned char *g, unsigned char b, size_t pitch_in, float4output, int2 size){

int2 thread = make_int2(blockDim.x * blockIdx.x + threadIdx.x, blockDim.y * blockIdx.y + threadIdx.y);



//comprobamos que no estemos fuera de la zona de la imagen definida

if(thread.x < size.x && thread.y < size.y)

{

	int index = thread.y * pitch_in/sizeof(unsigned char) + thread.x;

	float4 rgb_var;

	

	rgb_var.x = (float)r[index];

	rgb_var.y = (float)g[index];

	rgb_var.z = (float)b[index];

// index = thread.y * PITCH/sizeof(float4) + thread.x;

	output[index] = (float4) rgb_var;

	

}

}

global void read_image_kernel(float4*input, unsigned char *r, unsigned char *g, unsigned char *b, size_t pitch_out, int2 size){

int2 thread = make_int2(blockDim.x * blockIdx.x + threadIdx.x, blockDim.y * blockIdx.y + threadIdx.y);



//comprobamos que no estemos fuera de la zona de la imagen definida

if(thread.x < size.x && thread.y < size.y)

{

// int index = thread.y * PITCH/sizeof(float4) + thread.x;

	int index = thread.y * pitch_out/sizeof(unsigned char) + thread.x;

	float4 rgb_var = input[index];

	

	

	r[index] = (unsigned char)rgb_var.x;

	g[index] = (unsigned char)rgb_var.y;

	b[index] = (unsigned char)rgb_var.z;

	

}

}

////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////HOST FUNCTIONS//////////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////

host void set_image_data(unsigned char *r_in, unsigned char *g_in, unsigned char *b_in, int2 size,dim3 dimBlock, dim3 dimGrid)

{

//gets the image data, and turns it into a float4 array.

float4 *image_dev;

cudaGetSymbolAddress((void**)&image_dev, image);

unsigned char *r_dev;

unsigned char *g_dev;

unsigned char *b_dev;

size_t pitch_r_g_b;

size_t pitch_rgb;

cudaMallocPitch((void **) &image_dev, &pitch_rgb, size.x*sizeof(float4), size.y);

cudaMallocPitch((void **) &r_dev, &pitch_r_g_b, size.x*sizeof(unsigned char), size.y);

cudaMallocPitch((void **) &g_dev, &pitch_r_g_b, size.x*sizeof(unsigned char), size.y);

cudaMallocPitch((void **) &b_dev, &pitch_r_g_b, size.x*sizeof(unsigned char), size.y);

checkCUDAError("mallocs de image & r, g, b");



cudaMemcpy2D(r_dev, pitch_r_g_b, r_in, size.x, size.x*sizeof(unsigned char), size.y, cudaMemcpyHostToDevice);

cudaMemcpy2D(g_dev, pitch_r_g_b, g_in, size.x, size.x*sizeof(unsigned char), size.y, cudaMemcpyHostToDevice);

cudaMemcpy2D(b_dev, pitch_r_g_b, b_in, size.x, size.x*sizeof(unsigned char), size.y, cudaMemcpyHostToDevice);

checkCUDAError("memcpy de r, g, b");

// cudaMemcpyToSymbol(PITCH, &pitch_rgb, sizeof(size_t), 0, cudaMemcpyHostToDevice);

// checkCUDAError(“invalid memcpy to simbol”);

set_image_kernel <<<dimGrid, dimBlock>>> (r_dev, g_dev, b_dev, pitch_r_g_b, image_dev, size);

checkCUDAError("kernel set image data");

//The following is what works ok, but deleting the line "cudaGetSymbolAddress"

// cudaMemcpyToSymbol(image, image_dev, size.xsize.ysizeof(float4), 0, cudaMemcpyDeviceToDevice);

// checkCUDAError(“memcopy de float4 a device”);

cudaFree(r_dev);

cudaFree(g_dev);

cudaFree(b_dev);

checkCUDAError("free image_rbg_dev & r, g, b");

}

host void read_image_data(unsigned char *r_out, unsigned char *g_out, unsigned char *b_out, int2 size, dim3 dimBlock, dim3 dimGrid)

{

//gets the image data, and turns it into a float4 array.

float4 *image_dev;

cudaGetSymbolAddress((void**)&image_dev, image);	



unsigned char *r_dev;

unsigned char *g_dev;

unsigned char *b_dev;

size_t pitch_r_g_b;



cudaMallocPitch((void **) &r_dev, &pitch_r_g_b, size.x*sizeof(unsigned char), size.y);

cudaMallocPitch((void **) &g_dev, &pitch_r_g_b, size.x*sizeof(unsigned char), size.y);

cudaMallocPitch((void **) &b_dev, &pitch_r_g_b, size.x*sizeof(unsigned char), size.y);

checkCUDAError("mallocs de image & r, g, b");



read_image_kernel <<<dimGrid, dimBlock>>> (image_dev, r_dev, g_dev, b_dev, pitch_r_g_b, size);

checkCUDAError("kernel set image data");





cudaMemcpy2D(r_out, size.x, r_dev, pitch_r_g_b, size.x*sizeof(unsigned char), size.y, cudaMemcpyDeviceToHost);

cudaMemcpy2D(g_out, size.x, g_dev, pitch_r_g_b, size.x*sizeof(unsigned char), size.y, cudaMemcpyDeviceToHost);

cudaMemcpy2D(b_out, size.x, b_dev, pitch_r_g_b, size.x*sizeof(unsigned char), size.y, cudaMemcpyDeviceToHost);

checkCUDAError("memcpy de r, g, b");

cudaFree(r_dev);

cudaFree(g_dev);

cudaFree(b_dev);

checkCUDAError("free image_rbg_dev & r, g, b");

}

////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////EXTERN C FUNCTIONS/////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////

extern “C” void save( unsigned char *r_in, unsigned char *g_in, unsigned char *b_in, int im_width, int im_height)

{

int2 size = make_int2(im_width, im_height);

//block and grid dimensions

dim3 dimBlock(THREADS_X, THREADS_Y);

int blocks_x = im_width / THREADS_X;

int blocks_y = im_height / THREADS_Y;

if(im_width%THREADS_X !=0) blocks_x++;

if(im_height%THREADS_Y !=0) blocks_y++;

dim3 dimGrid(blocks_x, blocks_y);

//mean texture

set_image_data(r_in, g_in, b_in, size, dimBlock, dimGrid);



printf("image set\n");	

}

extern “C” void load(unsigned char *r_out, unsigned char *g_out, unsigned char *b_out ,int im_width, int im_height)

{

int2 size = make_int2(im_width, im_height);

//block and grid dimensions

dim3 dimBlock(THREADS_X, THREADS_Y);

int blocks_x = im_width / THREADS_X;

int blocks_y = im_height / THREADS_Y;

if(im_width%THREADS_X !=0) blocks_x++;

if(im_height%THREADS_Y !=0) blocks_y++;

dim3 dimGrid(blocks_x, blocks_y);

//mean texture

read_image_data(r_out, g_out, b_out, size, dimBlock, dimGrid);



printf("image read done\n");	

}

[/codebox]