char array - my mistake?

I want to edit bitmap using CUDA. So I have:

unsigned char *pdata;

unsigned char *pdataGPU;

int numPixels=1280*720;

pdata=new unsigned char[3*numPixels];

size=numPixels*sizeof(unsigned char)*3;

cudaMalloc((void **) &pdataGPU, size);

//copying etc.:

cudaMemcpy(pdataGPU, pdata, size, cudaMemcpyHostToDevice);

cudaThreadSynchronize();

//calculating:

frameGPUeffect1(img1->pdataGPU,1280*720);

//copying to main mem:

cudaMemcpy(pdata, pdataGPU, size, cudaMemcpyDeviceToHost);

cudaThreadSynchronize();

//and functions:

__global__ void CUDAframeGPUeffect1(unsigned char* pdataGPU,int max)

{

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if(idx<max)

	{

		pdataGPU[3*idx]=pdataGPU[3*idx];

		pdataGPU[3*idx+1]=0;

		pdataGPU[3*idx+2]=pdataGPU[3*idx+2];

	}

}

extern "C"

void frameGPUeffect1(unsigned char* pdataGPU,int max)

{

	int size=720;

	int block_size = 2;

	int n_blocks = max/block_size + (size%block_size == 0 ? 0:1);

	printf("\nGPU\n");

	CUDAframeGPUeffect1 <<< n_blocks, block_size >>> (pdataGPU,max);

}

But here’s no difference between input and output - I have the same image.

I want to edit bitmap using CUDA. So I have:

unsigned char *pdata;

unsigned char *pdataGPU;

int numPixels=1280*720;

pdata=new unsigned char[3*numPixels];

size=numPixels*sizeof(unsigned char)*3;

cudaMalloc((void **) &pdataGPU, size);

//copying etc.:

cudaMemcpy(pdataGPU, pdata, size, cudaMemcpyHostToDevice);

cudaThreadSynchronize();

//calculating:

frameGPUeffect1(img1->pdataGPU,1280*720);

//copying to main mem:

cudaMemcpy(pdata, pdataGPU, size, cudaMemcpyDeviceToHost);

cudaThreadSynchronize();

//and functions:

__global__ void CUDAframeGPUeffect1(unsigned char* pdataGPU,int max)

{

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	if(idx<max)

	{

		pdataGPU[3*idx]=pdataGPU[3*idx];

		pdataGPU[3*idx+1]=0;

		pdataGPU[3*idx+2]=pdataGPU[3*idx+2];

	}

}

extern "C"

void frameGPUeffect1(unsigned char* pdataGPU,int max)

{

	int size=720;

	int block_size = 2;

	int n_blocks = max/block_size + (size%block_size == 0 ? 0:1);

	printf("\nGPU\n");

	CUDAframeGPUeffect1 <<< n_blocks, block_size >>> (pdataGPU,max);

}

But here’s no difference between input and output - I have the same image.

Hi!

You have several errors:
-> first: your blocksize is 2??? it’s very strange;
-> second: 1280*720 = 921600/2 = 460800 blocks??? Are you sure? You have to check the limit of the grid. In the CCs 1.0,1.1,1.2, 1.3 and 2.0 is 65535 blocks. If you exceeds this limit, your execution does not fail but the kernel is not executed.

Regards!

Hi!

You have several errors:
-> first: your blocksize is 2??? it’s very strange;
-> second: 1280*720 = 921600/2 = 460800 blocks??? Are you sure? You have to check the limit of the grid. In the CCs 1.0,1.1,1.2, 1.3 and 2.0 is 65535 blocks. If you exceeds this limit, your execution does not fail but the kernel is not executed.

Regards!

Ahh, thanks - I increased block size and now it works.

Ahh, thanks - I increased block size and now it works.

can i ask something?

why do you multiply by 3 in the size and the index?
what does it means?

execuse me if it is evident because i’m a beginner in CUDA

can i ask something?

why do you multiply by 3 in the size and the index?
what does it means?

execuse me if it is evident because i’m a beginner in CUDA

Each pixel in this example would be 24bits (8bits each for Red, Green and Blue), so three chars are needed per pixel to represent the data.

Each pixel in this example would be 24bits (8bits each for Red, Green and Blue), so three chars are needed per pixel to represent the data.

this code is about the longest common subsequence problem

__global__ void LCS( int* C, char* A, char* B, int wA, int wB)

{

int current_Index = blockIdx.x * blockDim.x + thread_Id;

int wC=wA+1;

int j,i;

int mn=0;

if(wA>wB)

	{

   	

		//for(k=0;k<=wA;k++){

			if (current_Index<=wA)

			{

				j=current_Index;

				i=0;

				mn=min(current_Index,wB);

				while((i<=mn)||(j>0))

				{

					if (i==0 && j==0)

					C[i*wC+j]=0;

					if (A[current_Index-1]==B[current_Index-1])

						C[i*wC+j]=C[(i-1)*wC+j-1]+1;

						

					else

						C[i*wC+j]=max(C[(i-1)*wC+j],C[i*wC+j-1]); 

						

					if (i<=mn)

						i=i+1;

					if (j>0)

						j=j+1;

			}

			__syncthreads();

		}

		

			current_Index = blockIdx.x * blockDim.x + thread_Id+1;;

			if(current_Index<=wB)

			{

				j=wA;

				i=current_Index;

				while(i<=wB)

				{

					if (A[current_Index-1]==B[current_Index-1])

						C[i*wC+j]=C[(i-1)*wC+j-1]+1;

					else

					C[i*wC+j]=max(C[(i-1)*wC+j],C[i*wC+j-1]);

					i=i+1;

					j=j-1;

				}

			__syncthreads();

			}

(the parallelism exists in the elements of the antidiagonal)

Which is equivalent to

for (i=0;i<N+1;i++)

C[i][0]=0;

for (j=1;j<M+1;j++)

C[0][j]=0;

for (i=1;i<N+1;i++)

	for(j=1;j<M+1;j++)

	{

			if (X[j-1]==Y[i-1])

			C[i][j]=C[i-1][j-1]+1;

			else

			C[i][j]=max(C[i-1][j],C[i][j-1]);

	}

in C

block_size=16

wA= 2*Block_size //dimension of the array of char A

wB= bloch_size // dimension of the array of char B

when i execute the kernel i get wrong result in the matrix C (the matrix result of the dynamic programming of LCS)

Can you help me to find the error? if it is in the index of matrix and char’s array or either

this code is about the longest common subsequence problem

__global__ void LCS( int* C, char* A, char* B, int wA, int wB)

{

int current_Index = blockIdx.x * blockDim.x + thread_Id;

int wC=wA+1;

int j,i;

int mn=0;

if(wA>wB)

	{

   	

		//for(k=0;k<=wA;k++){

			if (current_Index<=wA)

			{

				j=current_Index;

				i=0;

				mn=min(current_Index,wB);

				while((i<=mn)||(j>0))

				{

					if (i==0 && j==0)

					C[i*wC+j]=0;

					if (A[current_Index-1]==B[current_Index-1])

						C[i*wC+j]=C[(i-1)*wC+j-1]+1;

						

					else

						C[i*wC+j]=max(C[(i-1)*wC+j],C[i*wC+j-1]); 

						

					if (i<=mn)

						i=i+1;

					if (j>0)

						j=j+1;

			}

			__syncthreads();

		}

		

			current_Index = blockIdx.x * blockDim.x + thread_Id+1;;

			if(current_Index<=wB)

			{

				j=wA;

				i=current_Index;

				while(i<=wB)

				{

					if (A[current_Index-1]==B[current_Index-1])

						C[i*wC+j]=C[(i-1)*wC+j-1]+1;

					else

					C[i*wC+j]=max(C[(i-1)*wC+j],C[i*wC+j-1]);

					i=i+1;

					j=j-1;

				}

			__syncthreads();

			}

(the parallelism exists in the elements of the antidiagonal)

Which is equivalent to

for (i=0;i<N+1;i++)

C[i][0]=0;

for (j=1;j<M+1;j++)

C[0][j]=0;

for (i=1;i<N+1;i++)

	for(j=1;j<M+1;j++)

	{

			if (X[j-1]==Y[i-1])

			C[i][j]=C[i-1][j-1]+1;

			else

			C[i][j]=max(C[i-1][j],C[i][j-1]);

	}

in C

block_size=16

wA= 2*Block_size //dimension of the array of char A

wB= bloch_size // dimension of the array of char B

when i execute the kernel i get wrong result in the matrix C (the matrix result of the dynamic programming of LCS)

Can you help me to find the error? if it is in the index of matrix and char’s array or either

Hi!
Well, i have any doubts about your code:
-> what is thread_Id?? Perhaps you wanted to write threadIdx.x? Did not?
-> If you wanted to write threadIdx.x, you are trying to access to A[-1] = B[-1] in the sentence ‘if (A[current_Index-1]==B[current_Index-1])’
This is a bit strange.
-> 16 threads is a halfwarp, i would use 32 of blockSize. The size of blocks is always better if it is a multiple of 32.

Check this details. Regards.

Hi!
Well, i have any doubts about your code:
-> what is thread_Id?? Perhaps you wanted to write threadIdx.x? Did not?
-> If you wanted to write threadIdx.x, you are trying to access to A[-1] = B[-1] in the sentence ‘if (A[current_Index-1]==B[current_Index-1])’
This is a bit strange.
-> 16 threads is a halfwarp, i would use 32 of blockSize. The size of blocks is always better if it is a multiple of 32.

Check this details. Regards.

I change it to threadIdx.x and Block_size to 32;

and i execute it and it gives me an error ("LCS has triggered a breakpoint) in this stage (after the execution of the kernel):

LCS<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
CheckConditionXR_(cudaGetLastError() == cudaSuccess, -1);

so i get a wrong result.
is there any problem in my code in the index of the matrix result C?

I change it to threadIdx.x and Block_size to 32;

and i execute it and it gives me an error ("LCS has triggered a breakpoint) in this stage (after the execution of the kernel):

LCS<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
CheckConditionXR_(cudaGetLastError() == cudaSuccess, -1);

so i get a wrong result.
is there any problem in my code in the index of the matrix result C?

As i said before, this code ‘A[current_Index-1]==B[current_Index-1]’ when current_index = 0 (thread #0 in block #0) is attempting to
access a A[-1] and B[-1]. This is an error. The range of current_index is [0,(blockDim.x*gridDim.x-1)], is not? Then, the correct access would be A[current_index] and B[current_index].

More, if you have changed the size of blocks (16 to 32), you have to change the size of arrays and its initialization too.

As i said before, this code ‘A[current_Index-1]==B[current_Index-1]’ when current_index = 0 (thread #0 in block #0) is attempting to
access a A[-1] and B[-1]. This is an error. The range of current_index is [0,(blockDim.x*gridDim.x-1)], is not? Then, the correct access would be A[current_index] and B[current_index].

More, if you have changed the size of blocks (16 to 32), you have to change the size of arrays and its initialization too.

yes i changed it to A[current_Index] the same thing in B

the size of the arrays A and B are:

wA= 2*Block_size //dimension of the array of char A
wB= bloch_size // dimension of the array of char B

my problem is that i want to fill the matrix result anti-diagonal by anti-diagonal because in my algorithm every cell C[i][j] depends on the cell C[i-1][j-1] , C[i-1][j] and C[i][j-1]
so we can fill one anti-diagonal in the same time (in parallel)

is there any change in the index to realize this parallelism?

yes i changed it to A[current_Index] the same thing in B

the size of the arrays A and B are:

wA= 2*Block_size //dimension of the array of char A
wB= bloch_size // dimension of the array of char B

my problem is that i want to fill the matrix result anti-diagonal by anti-diagonal because in my algorithm every cell C[i][j] depends on the cell C[i-1][j-1] , C[i-1][j] and C[i][j-1]
so we can fill one anti-diagonal in the same time (in parallel)

is there any change in the index to realize this parallelism?