help me to implement kernel function(element-wise product), size 1024*1024

Hi!!

I am not good to construct kernel function…

Now, Im trying to build a kernel function for element-wise product x vector and y vector.

However, the problem is that x’s type is cufftComplex, so x has real and imaginary value
and y is only integer.

For example, x --> (3+2i),(2+4i), (10+7i)
y --> 2, 5, 3

     result: 6+4i,  10+20i, 30+21i

I know that it is really simple but I confuse every time how thread index will be…
Also, it is pretty big size.
the number of x and y are 1024*1024 in respectively.
(Additionally, I think my program dies
on checkCudaErrors(cudaMemcpy(x_w, x_t, _size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(h_w, h_t, _size, cudaMemcpyHostToDevice));

I assume that the size is too big…? is that right??? Is 1024*1024 too big?)

I attach the part of my code below (It doesn’t work well…)
and I’ve checked 3 main problems.

if anyone want to know full code, please tell me!

//////////////////////////////////////////////////////////////
//////////////this part is the problem 1//////////////
global void point_wise_product(cufftComplex *a, int b){
int tid = threadIdx.x;
if(tid<SIGNAL_SIZE
SIGNAL_SIZE)
a[tid].x = a[tid].x * b[tid];
a[tid].y = a[tid].y * b[tid];
}

#define SIGNAL_SIZE 1024*1024

int main()
{

//skip for creation of h_t values

cufftComplex *x_t;	

//Allocate host memory for the x(t)
x_t = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);

//Allocate host memory for the result(t)
cufftComplex *result = (cufftComplex *)malloc(_size);
cufftComplex *final_result = (cufftComplex *)malloc(_size);

//Initialize the memory for the signal
for(unsigned int i = 0; i < SIGNAL_SIZE; i++)
{
	x_t[i].x = rand()/(float)RAND_MAX * 100;
	x_t[i].y = 0;
	//printf("host signal: %f\n", x_t[i].x);
}

//Allocate device memory for signal
cufftComplex *x_w;
checkCudaErrors(cudaMalloc((void **)&x_w, _size));

//Allocate device memory for h
int *h_w;
checkCudaErrors(cudaMalloc((void **)&h_w, _size));

    //////////////////////////////////////////////////////////////////////////////
    //////////////**this part is the problem 2**//////////////////////////////
//Copy host memory to device
checkCudaErrors(cudaMemcpy(x_w, x_t, _size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(h_w, h_t, _size, cudaMemcpyHostToDevice));   

      .....fft.....

    //////////////////////////////////////////////////////////////////////////////
    //////////////**this part is the problem 3**//////////////////////////////
    dim3 dimGrids(SIGNAL_SIZE/TILE_WIDTH, SIGNAL_SIZE/TILE_WIDTH);
dim3 dimBlocks(TILE_WIDTH, TILE_WIDTH);

//Multiply the coefficients tohether and normalize the result
printf("Launching ComplexPointwiseAndScale<<< >>>\n");
point_wise_product<<<dimGrids, dimBlocks>>>(x_w, h_w);


}

Yes, provide full code.
No, 1024*1024 is not too big.

When you place code in this forum, it’s helpful if you:

  1. indent properly for easier reading
  2. use the code marker in the toolbar - it is the </> symbol. That is highlight the code you have just pasted into this forum, and then click the </> symbol above, before sending your reply.

I’m really sorry about late reply.
below is the full code.

I think that my kernel function’s tid is too bad…

#define SIGNAL_SIZE 1024*1024
#define TILE_WIDTH 16

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iomanip>

// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include <windows.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>

__global__ void point_wise_product(cufftComplex *a, int *b){
	int tid =  threadIdx.x;
	if(tid<SIGNAL_SIZE*SIGNAL_SIZE)
		a[tid].x = a[tid].x * b[tid];
		a[tid].y = a[tid].y * b[tid];
}

int main(int argc, char **argv)
{
	cudaError_t err;
	//return device ID
	//findCudaDevice(argc, (const char **)argv);

	int _size = sizeof(cufftComplex) * SIGNAL_SIZE;

	LARGE_INTEGER liCounter1, liCounter2, liFrequency;
	QueryPerformanceFrequency(&liFrequency);

	//h_t values
	int *h_t;
	h_t = (int *)malloc(sizeof(int) * SIGNAL_SIZE);
	h_t[0] = 1;
	h_t[SIGNAL_SIZE/2] = 1;

	for(unsigned int i = 0; i < SIGNAL_SIZE/2 - 1; i++)
	{
		h_t[i+1] = 2;
		h_t[SIGNAL_SIZE/2 + 1 + i] = 0;
	}

	
	//cufftReal *x_t;	
	cufftComplex *x_t;	

	//Allocate host memory for the x(t)
	x_t = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
	
	//Allocate host memory for the result(t)
	cufftComplex *result = (cufftComplex *)malloc(_size);
	cufftComplex *final_result = (cufftComplex *)malloc(_size);

	//Initialize the memory for the signal
	for(unsigned int i = 0; i < SIGNAL_SIZE; i++)
	{
		x_t[i].x = rand()/(float)RAND_MAX * 100;
		x_t[i].y = 0;
		//printf("host signal: %f\n", x_t[i].x);
	}

	//Allocate device memory for signal
	cufftComplex *x_w;
	checkCudaErrors(cudaMalloc((void **)&x_w, _size));

	//Allocate device memory for h
	int *h_w;
	err = cudaMalloc((void **)&h_w, _size);
	printf("%d , %d  ..malloc ", cudaSuccess, err);
	
	//Copy host memory to device
	err = cudaMemcpy(x_w, x_t, _size, cudaMemcpyHostToDevice);
	printf("%d , %d  .. ", cudaSuccess, err);

	Sleep(3);

	err = cudaMemcpy(h_w, h_t, _size, cudaMemcpyHostToDevice);
	printf("%d , %d", cudaSuccess, err);

	//cufft plan
	cufftHandle plan;
	checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));

	QueryPerformanceCounter(&liCounter1);
	//transform signal
	printf("Transforming signal cufftexecR2C\n");
	checkCudaErrors(cufftExecC2C(plan, x_w, x_w, CUFFT_FORWARD));

	dim3 dimGrids(SIGNAL_SIZE/TILE_WIDTH, SIGNAL_SIZE/TILE_WIDTH);
	dim3 dimBlocks(TILE_WIDTH, TILE_WIDTH);

	//Multiply the coefficients tohether and normalize the result
	printf("Launching ComplexPointwiseAndScale<<< >>>\n");
	point_wise_product<<<dimGrids, dimBlocks>>>(x_w, h_w);

	//Copy device memory to host
	checkCudaErrors(cudaMemcpy(result, x_w, _size, cudaMemcpyDeviceToHost));
			
	//transform signal
	printf("Transforming signal cufftexecC2C\n");
	cufftHandle plan2;
	checkCudaErrors(cufftPlan1d(&plan2, SIGNAL_SIZE, CUFFT_C2C, 1));	
	checkCudaErrors(cufftExecC2C(plan2, (cufftComplex *)x_w, (cufftComplex *)x_w, 1));
	QueryPerformanceCounter(&liCounter2);

	//Copy device memory to host
	printf("Copy device memory to host\n");
	checkCudaErrors(cudaMemcpy(final_result, x_w, _size, cudaMemcpyDeviceToHost));

	cufftDestroy(plan);
	cufftDestroy(plan2);
	free(x_t);
	cudaFree(x_w);
	cudaFree(h_w);
	free(result);
	free(final_result);
}

int tid = (blockDim.x * blockIdx.x) + threadIdx.x;

if(tid<SIGNAL_SIZE)
{
a[tid].x = a[tid].x * b[tid];
a[tid].y = a[tid].y * b[tid];
}

dim3 dimGrids((SIGNAL_SIZE / 128), 1, 1);
dim3 dimBlocks(128, 1, 1);

little_jimmy

thanks for you reply!

I changed my code and attached it below.

it’s okay when signal_size is 128128
however, if signal_size modifies to 1024
1024, it didn’t work.

also, program dead point in my code is attached below too.

#define SIGNAL_SIZE 128*128
#define TILE_WIDTH 128

__global__ void point_wise_product(cufftComplex *a, int *b){

	const int Row = blockIdx.y*TILE_WIDTH + threadIdx.y;
	const int Col = blockIdx.x*TILE_WIDTH + threadIdx.x;
	const int numThreads = SIGNAL_SIZE*SIGNAL_SIZE;

	if(Row < SIGNAL_SIZE && Col < SIGNAL_SIZE){
		a[Row*TILE_WIDTH + Col].x = a[Row*TILE_WIDTH + Col].x * b[Row*TILE_WIDTH + Col];
		a[Row*TILE_WIDTH + Col].y = a[Row*TILE_WIDTH + Col].y * b[Row*TILE_WIDTH + Col];
	}
	
}

dead point:
//Copy host memory to device

#define SIGNAL_SIZE 1024*1024
       int _size = sizeof(cufftComplex) * SIGNAL_SIZE;

       err = cudaMemcpy(x_w, x_t, _size, cudaMemcpyHostToDevice);
	printf("%d , %d  .. ", cudaSuccess, err);	

        //here is the dead point. It never returns any err msg.
	err = cudaMemcpy(h_w, h_t, _size, cudaMemcpyHostToDevice);
	printf("%d , %d", cudaSuccess, err);

does anyone know why this program died?

i fail to comprehend why you need to use tiles, given that you seem to be using ‘flat’ arrays, in a ‘flat’ manner
the additional dimensions of the tile add no value, and rather seems to be a point of confusion

and i fail to comprehend why you need more threads than array elements

if(tid<SIGNAL_SIZE*SIGNAL_SIZE)

and

const int numThreads = SIGNAL_SIZE*SIGNAL_SIZE;

you did not show the block/ grid dimension allocation of the above kernel
in essence, what are dimGrids and dimBlocks assigned to for the above kernel
how you assign the block/ grid dimensions and how you use threads of the block would determine whether the code would work or not

Oh, I found the problem… I had a mistake for h_w size.

when I allocate device memory for h_w, the size was int _size = sizeof(cufftComplex) * SIGNAL_SIZE
However, h_w size should be int _size = sizeof(int) * SIGNAL_SIZE, and this problem was solved!

Thanks guys!