Yosef
June 5, 2016, 10:26am
1
when I run the following code:
#define THREADS_BLOCKS_IN_THE_GRID 8
#define NUM_OF_THREADS 1024
#define PIXELS 1024
__global__ void Compare (unsigned char *sub_dm_ORG , unsigned char *dm_SMP , float *successPercent , int rows_SMP)
{
__shared__ float temp[NUM_OF_THREADS];
int index = threadIdx.x + blockIdx.x * blockDim.x;
.
.
.
}
In the main:
int threadBlockInTheGrid = (PIXELS + NUM_OF_THREADS) / (NUM_OF_THREADS+1) ;
hashCompare<<<threadBlockInTheGrid , NUM_OF_THREADS >>> (Image_ORG , Image_SMP , d_successPercent , rows_SMP);
For PIXELS = 1024 the code works fine.
For PIXELS greater than 1024, the code compiles but does not run correctly.
Can someone help?
in the failing case, try running your code with cuda-memcheck
int threadBlockInTheGrid = (PIXELS + NUM_OF_THREADS) / (NUM_OF_THREADS+1) ;
should be:
int threadBlockInTheGrid = (PIXELS + NUM_OF_THREADS -1) / NUM_OF_THREADS ;
Yosef
June 8, 2016, 3:07pm
4
I tried changing the code to this,
but it still does not work for more than 1024 points.
My code is:
#include <stdio.h>
#include <time.h>
#define THREAD_NUMBER 1024 // Number of Threads inside the block
#define ELEMENT_NUMBER 1024
__global__ void Compare (unsigned char *sub_dm_ORG , unsigned char *dm_SMP , float *successPercent)
{
__shared__ float temp[THREAD_NUMBER];
int index = threadIdx.x + blockIdx.x * blockDim.x;
.
.
.
}
int main()
{
clock_t startTime = clock();
.
.
.
int BlockNumber = ELEMENT_NUMBER / THREAD_NUMBER;
//int ThreadBlockInTheGrid = (ELEMENT_NUMBER + THREAD_NUMBER) / (THREAD_NUMBER+1) ;
hashCompare<<<BlockNumber , THREAD_NUMBER >>> (Image_ORG , Image_SMP , d_successPercent);
.
.
.
return 0;
}
If I have more than one block,
do I need to control how the work is divided among the threads? If so, how?
I do not understand how to use this kernel call.
I need to process 50,000 elements.
int BlockNumber = (ELEMENT_NUMBER + THREAD_NUMBER -1) / THREAD_NUMBER; // round-up div.
Yosef
June 9, 2016, 9:21am
6
Unfortunately, if I change ELEMENT_NUMBER to 10240 (for example), it still does not work. )-:
> If I change ELEMENT_NUMBER to 10240 (for example), it still does not work. )-:
See the following sample; it works.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
/*
 * Counts the positions at which two device arrays differ.
 *
 * Launch layout: 1-D grid of 1-D blocks; the grid must supply at least
 * `size` threads (one thread handles one element). No shared memory is used.
 *
 * a, b        device pointers to at least `size` ints each
 * diff_count  device pointer to a single int that receives the count.
 *             PRECONDITION: the host must set *diff_count to 0 before the
 *             launch (e.g. with cudaMemset). The counter cannot be reset
 *             safely inside the kernel: __syncthreads() only synchronizes the
 *             threads of one block, so a reset done by one thread would race
 *             with the atomicAdd calls issued by every other block.
 * size        number of elements to compare
 */
__global__ void kernel_compare(const int *a, const int *b, int* diff_count, unsigned int size) {
    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

    // The bounds check protects the tail of the grid: the last block may
    // contain threads whose index is past the end of the arrays.
    if (i < size && a[i] != b[i]) {
        atomicAdd(diff_count, 1);
    }
}
#include <iostream>

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << "\n";
    return false;
}

// Demo driver: builds two device arrays of n ints where the first half of b
// is non-zero and a is all zero, then prints how many positions differ.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int n = 10240;
    const int threadsPerBlock = 256;
    // Round-up division so that a partially filled last block is still launched.
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    const size_t bytes = n * sizeof(int);

    int* a = 0;
    int* b = 0;
    int* count = 0;
    int result = 0;

    // && short-circuits, so nothing runs after the first failing call.
    bool ok = cudaOk(cudaMalloc(&a, bytes), "cudaMalloc(a)")
           && cudaOk(cudaMalloc(&b, bytes), "cudaMalloc(b)")
           && cudaOk(cudaMalloc(&count, sizeof(int)), "cudaMalloc(count)")
           && cudaOk(cudaMemset(a, 0, bytes), "cudaMemset(a)")                 // fill 0
           && cudaOk(cudaMemset(b, 0, bytes), "cudaMemset(b)")                 // fill 0
           // cudaMemset writes bytes: each int in the first half becomes
           // 0x01010101, which is simply "some non-zero value".
           && cudaOk(cudaMemset(b, 1, (n / 2) * sizeof(int)), "cudaMemset(b, first half)")
           // The kernel requires the counter to be zero before it starts.
           && cudaOk(cudaMemset(count, 0, sizeof(int)), "cudaMemset(count)");

    if (ok) {
        kernel_compare<<<blocks, threadsPerBlock>>>(a, b, count, n);
        // A launch does not return a status; a bad configuration is only
        // visible through cudaGetLastError(). The blocking cudaMemcpy that
        // follows waits for the kernel and reports any execution error.
        ok = cudaOk(cudaGetLastError(), "kernel_compare launch")
          && cudaOk(cudaMemcpy(&result, count, sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(result)");
    }

    if (ok) {
        std::cout << result << " of " << n << " are different\n";
    }

    // cudaFree on a null pointer performs no operation, so this is safe even
    // when an allocation above failed.
    cudaFree(count);
    cudaFree(b);
    cudaFree(a);
    return ok ? 0 : 1;
}