From 4d6479bdef17c75af5fef0aea20d91666762c9db Mon Sep 17 00:00:00 2001 From: "enzo.decarvalhobittencourt" <ezdecarvalho@gmail.com> Date: Wed, 20 Dec 2023 17:41:08 +0100 Subject: [PATCH] reduced memory transfer with an atomic add --- Projet/CODE/apm/src/apm_gpu.cu | 64 +++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/Projet/CODE/apm/src/apm_gpu.cu b/Projet/CODE/apm/src/apm_gpu.cu index 344d55e..ab42d36 100644 --- a/Projet/CODE/apm/src/apm_gpu.cu +++ b/Projet/CODE/apm/src/apm_gpu.cu @@ -12,7 +12,11 @@ #define APM_DEBUG 0 #define NUMBER_THREADS_BY_BLOCK 1024 -#define MAX_BUFFER_SIZE 1024000 //102.4M file at most +#define MAX_BUFFER_SIZE 102400000 //102.4M file at most +//MAX_BUFFER_SIZE should be divisible by NUMBER_THREADS_BY_BLOCK (for better +// memory management). +//the amount of RAM used by the program is approximately 30*MAX_BUFFER_SIZE. +//If you get out of memory errors, you should reduce this value char * read_input_file(char * filename, int * size) { @@ -144,7 +148,7 @@ int levenshtein(char *s1, char *s2, int len, int * column, int approx_factor) } __global__ void levenshtein_cu(char *find, char *buf, int len, int n_bytes - ,int approx_factor, int* g_column, char* result_vec) + ,int approx_factor, int* g_column, int* result) { int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id if (tId > n_bytes) @@ -175,9 +179,13 @@ __global__ void levenshtein_cu(char *find, char *buf, int len, int n_bytes lastdiag = olddiag; } } - + + int res = 0; if (column[len] <= approx_factor) - {result_vec[tId] = 1;}//its a match + res = 1; + //{result_vec[tId] = 1;}//its a match + + atomicAdd(result, res); } int main(int argc, char ** argv) @@ -306,7 +314,17 @@ int main(int argc, char ** argv) , cudaGetErrorString(cu_err)); return 1; } - + + int result = 0; + int* result_dev; + cudaMalloc((void**)&result_dev, sizeof(int)); + if ((cu_err = cudaGetLastError()) != cudaSuccess) + { + fprintf(stderr, "Unable to allocate result 
on device: %s.\n" + , cudaGetErrorString(cu_err)); + return 1; + } + off_t offset = 0; while (offset < filesize) @@ -367,7 +385,8 @@ int main(int argc, char ** argv) fprintf(stderr, "Unable to allocate result vec on host."); return 1; } - + + /* cudaMemset(result_vec_dev, 0, NTBB*NB*sizeof(char)); if ((cu_err = cudaGetLastError()) != cudaSuccess) { @@ -375,10 +394,18 @@ int main(int argc, char ** argv) , cudaGetErrorString(cu_err)); return 1; } - + + cudaMemset(result_dev, 0, sizeof(int)); + if ((cu_err = cudaGetLastError()) != cudaSuccess) + { + fprintf(stderr, "Unable to init result to 0 on device: %s.\n" + , cudaGetErrorString(cu_err)); + return 1; + } + */ levenshtein_cu<<<NB,NTBB>>>(pattern_dev, buf_dev, size_pattern - , n_bytes, approx_factor, column_dev, result_vec_dev); + , n_bytes, approx_factor, column_dev, result_dev); if ((cu_err = cudaGetLastError()) != cudaSuccess) { fprintf(stderr, "Kernel execution of levenshtein_cu failed: %s.\n" @@ -387,19 +414,21 @@ int main(int argc, char ** argv) } //get result - cudaMemcpy(result_vec, result_vec_dev - , NTBB*NB*sizeof(char), cudaMemcpyDeviceToHost); + cudaMemcpy(&result, result_dev + , sizeof(int), cudaMemcpyDeviceToHost); if ((cu_err = cudaGetLastError()) != cudaSuccess) { - fprintf(stderr, "Unable to retrieve result vector on host: %s.\n" + fprintf(stderr, "Unable to retrieve result on host: %s.\n" , cudaGetErrorString(cu_err)); return 1; } - - //TODO MAYBE - reduction on gpu. 
+ + n_matches[i] += result; + + /* for (int j = 0 ; j<n_bytes ; j++) { - /* Highly advanced debbugging (printfs) + // Highly advanced debbugging (printfs) int column[size_pattern+1]; int d = levenshtein(pattern[i], buf+j, size_pattern, column @@ -411,9 +440,10 @@ int main(int argc, char ** argv) ,pattern[i], j); printf("%.*s\n",size_pattern,&buf[j]); } - */ + n_matches[i] += result_vec[j]; } + */ offset += (MAX_BUFFER_SIZE - size_pattern + 1); @@ -433,7 +463,8 @@ int main(int argc, char ** argv) , cudaGetErrorString(cu_err)); return 1; } - + + /* cudaFree(result_vec_dev); if ((cu_err = cudaGetLastError()) != cudaSuccess) { @@ -443,6 +474,7 @@ int main(int argc, char ** argv) } free(result_vec); + */ } //free memory - and then get onto the next pattern. -- GitLab