Parallelize radix-type counter... variable-base counter

Say you have a specific base you want your numbers represented in; in my case it's 5. I want to produce all numbers by incrementing in that base rather than base 10. What I'm really trying to do is produce all possible vectors of a certain dimension with a finite set of coefficients; the reason is to take the FFT of each one, basically all vectors in this function space. I've been looking through libraries and whatnot but haven't found a way, so I was wondering if anyone had advice on how to speed this up:

    int bucket[15] = {0};                       // 15 base-5 digits, least significant first

    // Enumerate all 5^15 vectors: print the current digits, then do a
    // ripple-carry increment in base 5. The loop ends once the most
    // significant digit overflows.
    while (bucket[14] != 5) {
        printf("%i %i %i %i %i %i %i %i %i %i %i %i %i %i %i\n",
               bucket[14], bucket[13], bucket[12], bucket[11], bucket[10],
               bucket[9], bucket[8], bucket[7], bucket[6], bucket[5],
               bucket[4], bucket[3], bucket[2], bucket[1], bucket[0]);
        for (int d = 0; d < 15; d++) {
            if (++bucket[d] < 5) break;         // no carry: done
            if (d < 14) bucket[d] = 0;          // carry into the next digit
        }
    }

There is obvious symmetry to it; it would be nice to do this on the device end rather than the host so I don't have to copy anything over… advice is appreciated!

Using CUDA 10 on a Jetson AGX.

But idk, each increment depends on the previous value, like a clock increment; does that affect its ability to be run in a grid?
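
Actually it doesn't have to be sequential: the counter only looks that way. The n-th base-5 digit of i is just (i / 5^n) % 5, so every index can be expanded independently, with no carry chain at all. A minimal CPU sketch of the idea:

    #include <stdio.h>

    int main(void)
    {
        /* demo: expand the first 25 indices; the same inner loop works
           for any i < 5^15, and every i is independent of the others */
        for (long long i = 0; i < 25; i++) {
            long long rem = i;
            for (int n = 0; n < 15; n++) {   /* digit n = (i / 5^n) % 5 */
                printf("%lld ", rem % 5);
                rem /= 5;
            }
            printf("\n");
        }
        return 0;
    }

Since each i stands alone, mapping one i (or one digit of one i) to one thread should be straightforward.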

https://stackoverflow.com/questions/68263564/making-base-r-vector-counting-parallel


#include <stdio.h>      /* printf */
#include <math.h>       /* floor, fmod */

#define ITERS 300000000 /* partial run; the full space has 5^15 = 30,517,578,125 entries */

int main() 
{
    /* vec[n] = 1/5^n, so floor(fmod(i * vec[n], 5)) extracts the n-th base-5 digit of i.
       Note: floating-point digit extraction can misround near digit boundaries. */
    double vec[15];
    vec[14] = 0.00000000016384;
    vec[13] = 0.0000000008192;
    vec[12] = 0.000000004096;
    vec[11] = 0.00000002048;
    vec[10] = 0.0000001024;
    vec[9] = 0.000000512;
    vec[8] = 0.00000256;
    vec[7] = 0.0000128;
    vec[6] = 0.000064;
    vec[5] = 0.00032;
    vec[4] = 0.0016;
    vec[3] = 0.008;
    vec[2] = 0.04;
    vec[1] = 0.2;
    vec[0] = 1;
    
    for (int i = 0; i < ITERS; i++) {
        printf("%f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n",
               floor(fmod(i*vec[0], 5)),
               floor(fmod(i*vec[1], 5)),
               floor(fmod(i*vec[2], 5)),
               floor(fmod(i*vec[3], 5)),
               floor(fmod(i*vec[4], 5)),
               floor(fmod(i*vec[5], 5)),
               floor(fmod(i*vec[6], 5)),
               floor(fmod(i*vec[7], 5)),
               floor(fmod(i*vec[8], 5)),
               floor(fmod(i*vec[9], 5)),
               floor(fmod(i*vec[10], 5)),
               floor(fmod(i*vec[11], 5)),
               floor(fmod(i*vec[12], 5)),
               floor(fmod(i*vec[13], 5)),
               floor(fmod(i*vec[14], 5)));
    }
    return 0;
}

Update: all the math functions are in the CUDA math SDK. Planning on using shared memory for the vec constants… will need to make the loop bound in the for 5^15 somehow.
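
Side note: for 15 read-only coefficients, __constant__ memory might be a simpler fit than shared memory. A sketch (radii_c is an illustrative name):

    // device side, at file scope: cached, read-only, visible to all kernels
    __constant__ float radii_c[15];

    // host side, once before any launch:
    // checkCudaErrors(cudaMemcpyToSymbol(radii_c, h_constants, 15 * sizeof(float)));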

The for loop will have to be changed to something like while (tid < N).
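
The usual shape for that is a grid-stride loop; a sketch with illustrative names, using integer div/mod instead of the fmod trick:

    __global__ void expand_digits(float * out, long long n)
    {
        // each global index i expands into its 15 base-5 digits independently
        for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
             i < n;
             i += (long long)blockDim.x * gridDim.x) {
            long long rem = i;
            for (int d = 0; d < 15; d++) {   // digit d = (i / 5^d) % 5
                out[i * 15 + d] = (float)(rem % 5);
                rem /= 5;
            }
        }
    }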

Kinda cool how the decimal digits of 1/5^n are just 2^n shifted left n places, since 1/5^n = 2^n / 10^n; e.g. 1/5^13 = 8192 / 10^13 = 0.0000000008192, and 8192 = 2^13. Wouldn't have guessed, lol.

Oh damn, maybe I don't need those constants.
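
Right, each thread could just derive its own coefficient instead of loading a table; a sketch:

    // 1/5^d by repeated multiplication; d is the thread's digit index
    __device__ float inv_pow5(int d)
    {
        float x = 1.0f;
        for (int k = 0; k < d; k++)
            x *= 0.2f;          // after the loop, x = 1/5^d
        return x;
    }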

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"
#include <cuda.h>

const int threads_per_block = 15;

__global__ void radix_count(float * results, int N) {

	__shared__ float radii[threads_per_block];
	// let one thread fill the table, then synchronize before anyone reads it
	if (threadIdx.x == 0) {
		radii[14] = 0.00000000016384f;
		radii[13] = 0.0000000008192f;
		radii[12] = 0.000000004096f;
		radii[11] = 0.00000002048f;
		radii[10] = 0.0000001024f;
		radii[9] = 0.000000512f;
		radii[8] = 0.00000256f;
		radii[7] = 0.0000128f;
		radii[6] = 0.000064f;
		radii[5] = 0.00032f;
		radii[4] = 0.0016f;
		radii[3] = 0.008f;
		radii[2] = 0.04f;
		radii[1] = 0.2f;
		radii[0] = 1.0f;
	}
	__syncthreads();

	// block = number, thread = digit: one output element per thread
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if (tid < N * threads_per_block) {
		results[tid] = floorf(fmodf(blockIdx.x * radii[threadIdx.x], 5.0f));
	}
}



int main() {

	// 16 numbers x 15 digits, zero-initialized
	float h_input[16][15] = {};
	float res_output[16][15];
	float * d_input;
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_input), 15*16*sizeof(float)));
	checkCudaErrors(cudaMemcpy(d_input, h_input, 15*16*sizeof(float), cudaMemcpyHostToDevice));
	
	radix_count<<<16,15>>>(d_input, 16);
	checkCudaErrors(cudaMemcpy(res_output, d_input, 15*16*sizeof(float), cudaMemcpyDeviceToHost));
	
	for (int i = 0; i < 16; i++) {
		for (int j = 0; j < 15; j++) {
			printf("%f,", res_output[i][j]);		
		}
		printf("\n");	
	}
	checkCudaErrors(cudaFree(d_input));
	return 0;
}

Not working yet, but it compiles and runs, lol. Also, I don't think that's the right way to use shared memory.
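
For reference, the usual shared-memory pattern is a cooperative load followed by a barrier; a sketch, assuming at least 15 threads per block:

    __global__ void uses_shared(const float * g_radii, float * out, int n)
    {
        __shared__ float radii[15];
        if (threadIdx.x < 15)
            radii[threadIdx.x] = g_radii[threadIdx.x];  // one element per thread
        __syncthreads();                                // all loads visible to the block

        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < n)
            out[tid] = radii[tid % 15];                 // any thread may read any slot now
    }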

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"
#include <cuda.h>

const int N = 1024;


__global__ void radix_count(float * results, float * radii, float * init_shared_mem, int n) {

	__shared__ float eyes[15];
	// one running counter per digit thread; no __syncthreads() needed since
	// each thread only ever touches its own slot. Offset by blockIdx.x so
	// blocks cover disjoint numbers instead of all repeating the same work.
	eyes[threadIdx.x] = init_shared_mem[threadIdx.x] + blockIdx.x;

	int tid = blockIdx.x * 15 + threadIdx.x;
	while (tid < n*15) {
		results[tid] = floorf(fmodf(eyes[threadIdx.x] * radii[threadIdx.x], 5.0f));
		eyes[threadIdx.x] += gridDim.x;
		tid += gridDim.x * 15;
	}
}

int main() {

	float h_input[N*15]; 
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			h_input[j+(i*15)] = 0;		
		}	
	}
	float h_constants[15] = {1, 0.2, 0.04, 0.008, 0.0016, 0.00032, 0.000064, 0.0000128,
				 0.00000256, 0.000000512, 0.0000001024, 0.00000002048,
				 0.000000004096, 0.0000000008192, 0.00000000016384};
	float res_output[N*15];
	float * d_input;
	float h_shared_mem[15] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	float * d_shared_mem;
	float * d_constants;

	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_input), 15*N*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_constants), 15*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_shared_mem), 15*sizeof(float)));
	checkCudaErrors(cudaMemcpy(d_input, h_input, 15*N*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_constants, h_constants, 15*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_shared_mem, h_shared_mem, 15*sizeof(float), cudaMemcpyHostToDevice));
	radix_count<<<N,15>>>(d_input, d_constants, d_shared_mem, N);
	checkCudaErrors(cudaMemcpy(res_output, d_input, 15*N*sizeof(float), cudaMemcpyDeviceToHost));
	
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			printf("%f|", res_output[j+(i*15)]);		
		}
		printf("\n");	
	}
	checkCudaErrors(cudaFree(d_input));
	checkCudaErrors(cudaFree(d_constants));
	checkCudaErrors(cudaFree(d_shared_mem));
	return 0;
}

Let's goo!!!!! No idea whether it's more efficient than the CPU rn, lol.
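
One way to check would be to bracket the kernel launch with CUDA events; a sketch around the kernel above:

    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaEventRecord(start));
    radix_count<<<N, 15>>>(d_input, d_constants, d_shared_mem, N);
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));   // milliseconds
    printf("kernel: %f ms\n", ms);
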
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"
#include <cuda.h>

const int N = 16;


__global__ void radix_count(float * results, float * radii, float * init_shared_mem, float * init_map, int n) {

	__shared__ float eyes[15];
	__shared__ float map[5];
	if (threadIdx.x < 5) {
		map[threadIdx.x] = init_map[threadIdx.x];	// one thread per map entry
	}
	eyes[threadIdx.x] = init_shared_mem[threadIdx.x];
	__syncthreads();

	// each thread handles one digit position and unrolls over the n numbers,
	// mapping every base-5 digit through the lookup table
	for (int k = 0; k < n; k++) {
		results[threadIdx.x + 15*k] = map[static_cast<int>(floorf(fmodf((eyes[threadIdx.x] + k) * radii[threadIdx.x], 5.0f)))];
	}
}

int main() {
	float h_map[5] = {7.6, 8.7, 9.8, 10.9, 11.10};	// placeholder digit -> value map
	float h_input[N*15]; 
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			h_input[j+(i*15)] = 0;		
		}	
	}
	float h_constants[15] = {1, 0.2, 0.04, 0.008, 0.0016, 0.00032, 0.000064, 0.0000128,
				 0.00000256, 0.000000512, 0.0000001024, 0.00000002048,
				 0.000000004096, 0.0000000008192, 0.00000000016384};
	float res_output[N*15];
	float * d_input;
	float h_shared_mem[15] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	float * d_shared_mem;
	float * d_constants;
	float * d_map;
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_map), 5*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_input), 15*N*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_constants), 15*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_shared_mem), 15*sizeof(float)));
	checkCudaErrors(cudaMemcpy(d_map, h_map, 5*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_input, h_input, 15*N*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_constants, h_constants, 15*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_shared_mem, h_shared_mem, 15*sizeof(float), cudaMemcpyHostToDevice));
	// one block of 15 digit threads covers all N numbers
	radix_count<<<1,15>>>(d_input, d_constants, d_shared_mem, d_map, N);
	checkCudaErrors(cudaMemcpy(res_output, d_input, 15*N*sizeof(float), cudaMemcpyDeviceToHost));
	
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			printf("%f|", res_output[j+(i*15)]);		
		}
		printf("\n");	
	}
	checkCudaErrors(cudaFree(d_input));
	checkCudaErrors(cudaFree(d_constants));
	checkCudaErrors(cudaFree(d_shared_mem));
	checkCudaErrors(cudaFree(d_map));
	return 0;
}

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"
#include <cuda.h>

const int N = 16;


__global__ void radix_count(float2 * results, float * radii, float * init_shared_mem, float2 * init_map, int n) {

	__shared__ float eyes[15];
	__shared__ float2 map[5];
	if (threadIdx.x < 5) {
		map[threadIdx.x] = init_map[threadIdx.x];	// one thread per map entry
	}
	eyes[threadIdx.x] = init_shared_mem[threadIdx.x];
	__syncthreads();

	// each thread handles one digit position and unrolls over the n numbers,
	// mapping every base-5 digit to a complex coefficient
	for (int k = 0; k < n; k++) {
		results[threadIdx.x + 15*k] = map[static_cast<int>(floorf(fmodf((eyes[threadIdx.x] + k) * radii[threadIdx.x], 5.0f)))];
	}
}

int main() {
	float2 h_input[N*15]; 
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			h_input[j+(i*15)].x = 0;
			h_input[j+(i*15)].y = 0;		
		}	
	}
	float h_constants[15] = {1, 0.2, 0.04, 0.008, 0.0016, 0.00032, 0.000064, 0.0000128,
				 0.00000256, 0.000000512, 0.0000001024, 0.00000002048,
				 0.000000004096, 0.0000000008192, 0.00000000016384};
	// digits 0..4 -> coefficients {0, 1, -1, i, -i}
	float2 h_map[5];
	h_map[0].x = 0;
	h_map[0].y = 0;
	h_map[1].x = 1;
	h_map[1].y = 0;
	h_map[2].x = -1;
	h_map[2].y = 0;
	h_map[3].x = 0;
	h_map[3].y = 1;
	h_map[4].x = 0;
	h_map[4].y = -1;
	float2 * d_map;
	float2 res_output[N*15];
	float2 * d_input;
	float h_shared_mem[15] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	float * d_shared_mem;
	float * d_constants;
	
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_map), 5*sizeof(float2)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_input), 15*N*sizeof(float2)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_constants), 15*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_shared_mem), 15*sizeof(float)));
	checkCudaErrors(cudaMemcpy(d_map, h_map, 5*sizeof(float2), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_input, h_input, 15*N*sizeof(float2), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_constants, h_constants, 15*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_shared_mem, h_shared_mem, 15*sizeof(float), cudaMemcpyHostToDevice));
	// one block of 15 digit threads covers all N numbers
	radix_count<<<1,15>>>(d_input, d_constants, d_shared_mem, d_map, N);
	checkCudaErrors(cudaMemcpy(res_output, d_input, 15*N*sizeof(float2), cudaMemcpyDeviceToHost));
	
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			printf("|%f,%f|", res_output[j+(i*15)].x, res_output[j+(i*15)].y);		
		}
		printf("_\n");	
	}
	checkCudaErrors(cudaFree(d_input));
	checkCudaErrors(cudaFree(d_constants));
	checkCudaErrors(cudaFree(d_shared_mem));
	checkCudaErrors(cudaFree(d_map));
	return 0;
}

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"
#include <cuda.h>

const int N = 16;


__global__ void radix_count(float2 * results, float * radii, float * init_shared_mem, float2 * init_map, int n) {

	__shared__ float eyes[15];
	__shared__ float2 map[5];
	if (threadIdx.x < 5) {
		map[threadIdx.x] = init_map[threadIdx.x];	// one thread per map entry
	}
	// offset each block's counters so blocks cover disjoint numbers
	eyes[threadIdx.x] = init_shared_mem[threadIdx.x] + blockIdx.x;
	__syncthreads();

	int tid = blockIdx.x * 15 + threadIdx.x;
	while (tid < n*15) {
		results[tid] = map[static_cast<int>(floorf(fmodf(eyes[threadIdx.x] * radii[threadIdx.x], 5.0f)))];
		eyes[threadIdx.x] += gridDim.x;
		tid += gridDim.x * 15;
	}
}

int main() {

	float2 h_input[N*15]; 
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			h_input[j+(i*15)].x = 0;
			h_input[j+(i*15)].y = 0;		
		}	
	}
	float h_constants[15] = {1, 0.2, 0.04, 0.008, 0.0016, 0.00032, 0.000064, 0.0000128,
				 0.00000256, 0.000000512, 0.0000001024, 0.00000002048,
				 0.000000004096, 0.0000000008192, 0.00000000016384};
	// digits 0..4 -> coefficients {0, 1, -1, i, -i}
	float2 h_map[5];
	h_map[0].x = 0;
	h_map[0].y = 0;
	h_map[1].x = 1;
	h_map[1].y = 0;
	h_map[2].x = -1;
	h_map[2].y = 0;
	h_map[3].x = 0;
	h_map[3].y = 1;
	h_map[4].x = 0;
	h_map[4].y = -1;
	float2 * d_map;
	float2 res_output[N*15];
	float2 * d_input;
	float h_shared_mem[15] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	float * d_shared_mem;
	float * d_constants;
	
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_map), 5*sizeof(float2)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_input), 15*N*sizeof(float2)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_constants), 15*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_shared_mem), 15*sizeof(float)));
	checkCudaErrors(cudaMemcpy(d_map, h_map, 5*sizeof(float2), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_input, h_input, 15*N*sizeof(float2), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_constants, h_constants, 15*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_shared_mem, h_shared_mem, 15*sizeof(float), cudaMemcpyHostToDevice));
	radix_count<<<N,15>>>(d_input, d_constants, d_shared_mem, d_map, N);
	checkCudaErrors(cudaMemcpy(res_output, d_input, 15*N*sizeof(float2), cudaMemcpyDeviceToHost));
	
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < 15; j++) {
			printf("|%f,%f|", res_output[j+(i*15)].x, res_output[j+(i*15)].y);		
		}
		printf("_\n");	
	}
	checkCudaErrors(cudaFree(d_input));
	checkCudaErrors(cudaFree(d_constants));
	checkCudaErrors(cudaFree(d_shared_mem));
	checkCudaErrors(cudaFree(d_map));
	return 0;
	
}



// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"
#include <cuda.h>

const int N = 1024;
const int M = 32;
const int vector_size = 15;

__global__ void kernel(float2 * result, float * init_eyes, float * init_coeff) {
	
	__shared__ float eyes[N];
	__shared__ float reduction_coeff[N];
	int tid = threadIdx.x;
	if (tid < N) {
		eyes[tid] = init_eyes[tid];
		reduction_coeff[tid] = init_coeff[tid];
	}
	__syncthreads();

	int index_entry = blockIdx.x * blockDim.x + threadIdx.x;

	if (index_entry < N*M) {
		// offset each block's batch so the M blocks enumerate different numbers
		float i = eyes[threadIdx.x] + blockIdx.x * (float)(N / vector_size);
		result[index_entry].x = floorf(fmodf(i * reduction_coeff[threadIdx.x], 5.0f));
		result[index_entry].y = 0;	// was left uninitialized before
	}
}

void init_eyes(float * arr, int a, int b){
	// arr holds, for each of the a result slots, the index i of the number
	// that slot belongs to: b slots of 0, then b slots of 1, and so on
	int cnt = 0; 
	int fill_val = 0;
	int c = a/b;
	for (int i = 0; i < a; i++) {
		if (cnt == b) {
			fill_val++;
			cnt = 0;
			if (fill_val == c) {
				fill_val = 0;			
			}		
		}
		arr[i] = fill_val;
		cnt++;	
	}
}

void init_reduction_coeff(float * arr, float * consts, int a, int b){
	// repeat the b digit coefficients across the a slots, zeroing the
	// trailing slots that don't form a complete vector
	int d = a%b;
	for (int i = 0; i < a; i++) {
		if (i >= a - d) {
			arr[i] = 0;		
		} else {
			arr[i] = consts[i%b];
		}
	}
}

int main() {
	float h_constants[15] = {1, 0.2, 0.04, 0.008, 0.0016, 0.00032, 0.000064, 0.0000128,
				 0.00000256, 0.000000512, 0.0000001024, 0.00000002048,
				 0.000000004096, 0.0000000008192, 0.00000000016384};


	// to hold result from gpu computation
	float2 * d_input;
	float2 res_output[N*M];
	// vectors to seed the per-block shared memory
	float h_eyes[N];
	float * d_eyes;
	float h_thread_row_constants[N];
	float * d_constants;
	// init host arrays to be transferred to shared mem
	init_eyes(h_eyes, N, vector_size);
	init_reduction_coeff(h_thread_row_constants, h_constants, N, vector_size);


	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_input), M*N*sizeof(float2)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_eyes), N*sizeof(float)));
	checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_constants), N*sizeof(float)));
	checkCudaErrors(cudaMemcpy(d_eyes, h_eyes, N*sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_constants, h_thread_row_constants, N*sizeof(float), cudaMemcpyHostToDevice));
	dim3 threads_per_block(N, 1);	
	dim3 blocks_per_grid(M, 1);	
	kernel<<<blocks_per_grid, threads_per_block>>>(d_input, d_eyes, d_constants);
	
	checkCudaErrors(cudaMemcpy(res_output, d_input, M*N*sizeof(float2), cudaMemcpyDeviceToHost));
	for (int i = 0; i < N; i++) {
		printf("%f,", res_output[i].x);	
		if ((i+1)%vector_size == 0) {
			printf("\n");		
		}
	}
	
	
	checkCudaErrors(cudaFree(d_input));
	checkCudaErrors(cudaFree(d_eyes));
	checkCudaErrors(cudaFree(d_constants));
	return 0;
}



Bit by bit, boys ;) pun intended!!!

I’m not sure what is going on with this thread. Posting code or code updates with no written context at all isn’t particularly useful, and posting pictures with no context at all isn’t what this forum is about. Please desist from this activity. If you’re using this forum thread as your own personal scratchpad, please stop.

It didn't seem like a big deal to track my progress on here; I was having fun learning, and I think CUDA is really cool. I apologize, Robert, and I will desist from this activity.

There are great resources like GitHub and Pastebin for that. Thanks.

