Skip to content
Extraits de code Groupes Projets
Valider 4d6479bd rédigé par enzo.decarvalhobittencourt's avatar enzo.decarvalhobittencourt
Parcourir les fichiers

reduced memory transfer with an atomic add

parent 5272811e
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -12,7 +12,11 @@ ...@@ -12,7 +12,11 @@
#define APM_DEBUG 0 #define APM_DEBUG 0
#define NUMBER_THREADS_BY_BLOCK 1024 #define NUMBER_THREADS_BY_BLOCK 1024
#define MAX_BUFFER_SIZE 1024000 //102.4M file at most #define MAX_BUFFER_SIZE 102400000 //102.4M file at most
//MAX_BUFFER_SIZE should be divisible by NUMBER_THREADS_BY_BLOCK (for better
// memory management).
//The amount of RAM used by the program is approximately 30*MAX_BUFFER_SIZE.
//If you get out-of-memory errors, you should reduce this value.
char * read_input_file(char * filename, int * size) char * read_input_file(char * filename, int * size)
{ {
...@@ -144,7 +148,7 @@ int levenshtein(char *s1, char *s2, int len, int * column, int approx_factor) ...@@ -144,7 +148,7 @@ int levenshtein(char *s1, char *s2, int len, int * column, int approx_factor)
} }
__global__ void levenshtein_cu(char *find, char *buf, int len, int n_bytes __global__ void levenshtein_cu(char *find, char *buf, int len, int n_bytes
,int approx_factor, int* g_column, char* result_vec) ,int approx_factor, int* g_column, int* result)
{ {
int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id
if (tId > n_bytes) if (tId > n_bytes)
...@@ -175,9 +179,13 @@ __global__ void levenshtein_cu(char *find, char *buf, int len, int n_bytes ...@@ -175,9 +179,13 @@ __global__ void levenshtein_cu(char *find, char *buf, int len, int n_bytes
lastdiag = olddiag; lastdiag = olddiag;
} }
} }
int res = 0;
if (column[len] <= approx_factor) if (column[len] <= approx_factor)
{result_vec[tId] = 1;}//its a match res = 1;
//{result_vec[tId] = 1;}//its a match
atomicAdd(result, res);
} }
int main(int argc, char ** argv) int main(int argc, char ** argv)
...@@ -306,7 +314,17 @@ int main(int argc, char ** argv) ...@@ -306,7 +314,17 @@ int main(int argc, char ** argv)
, cudaGetErrorString(cu_err)); , cudaGetErrorString(cu_err));
return 1; return 1;
} }
int result = 0;
int* result_dev;
cudaMalloc((void**)&result_dev, sizeof(int));
if ((cu_err = cudaGetLastError()) != cudaSuccess)
{
fprintf(stderr, "Unable to allocate result on device: %s.\n"
, cudaGetErrorString(cu_err));
return 1;
}
off_t offset = 0; off_t offset = 0;
while (offset < filesize) while (offset < filesize)
...@@ -367,7 +385,8 @@ int main(int argc, char ** argv) ...@@ -367,7 +385,8 @@ int main(int argc, char ** argv)
fprintf(stderr, "Unable to allocate result vec on host."); fprintf(stderr, "Unable to allocate result vec on host.");
return 1; return 1;
} }
/*
cudaMemset(result_vec_dev, 0, NTBB*NB*sizeof(char)); cudaMemset(result_vec_dev, 0, NTBB*NB*sizeof(char));
if ((cu_err = cudaGetLastError()) != cudaSuccess) if ((cu_err = cudaGetLastError()) != cudaSuccess)
{ {
...@@ -375,10 +394,18 @@ int main(int argc, char ** argv) ...@@ -375,10 +394,18 @@ int main(int argc, char ** argv)
, cudaGetErrorString(cu_err)); , cudaGetErrorString(cu_err));
return 1; return 1;
} }
cudaMemset(result_dev, 0, sizeof(int));
if ((cu_err = cudaGetLastError()) != cudaSuccess)
{
fprintf(stderr, "Unable to init result to 0 on device: %s.\n"
, cudaGetErrorString(cu_err));
return 1;
}
*/
levenshtein_cu<<<NB,NTBB>>>(pattern_dev, buf_dev, size_pattern levenshtein_cu<<<NB,NTBB>>>(pattern_dev, buf_dev, size_pattern
, n_bytes, approx_factor, column_dev, result_vec_dev); , n_bytes, approx_factor, column_dev, result_dev);
if ((cu_err = cudaGetLastError()) != cudaSuccess) if ((cu_err = cudaGetLastError()) != cudaSuccess)
{ {
fprintf(stderr, "Kernel execution of levenshtein_cu failed: %s.\n" fprintf(stderr, "Kernel execution of levenshtein_cu failed: %s.\n"
...@@ -387,19 +414,21 @@ int main(int argc, char ** argv) ...@@ -387,19 +414,21 @@ int main(int argc, char ** argv)
} }
//get result //get result
cudaMemcpy(result_vec, result_vec_dev cudaMemcpy(&result, result_dev
, NTBB*NB*sizeof(char), cudaMemcpyDeviceToHost); , sizeof(int), cudaMemcpyDeviceToHost);
if ((cu_err = cudaGetLastError()) != cudaSuccess) if ((cu_err = cudaGetLastError()) != cudaSuccess)
{ {
fprintf(stderr, "Unable to retrieve result vector on host: %s.\n" fprintf(stderr, "Unable to retrieve result on host: %s.\n"
, cudaGetErrorString(cu_err)); , cudaGetErrorString(cu_err));
return 1; return 1;
} }
//TODO MAYBE - reduction on gpu. n_matches[i] += result;
/*
for (int j = 0 ; j<n_bytes ; j++) for (int j = 0 ; j<n_bytes ; j++)
{ {
/* Highly advanced debbugging (printfs) // Highly advanced debbugging (printfs)
int column[size_pattern+1]; int column[size_pattern+1];
int d = int d =
levenshtein(pattern[i], buf+j, size_pattern, column levenshtein(pattern[i], buf+j, size_pattern, column
...@@ -411,9 +440,10 @@ int main(int argc, char ** argv) ...@@ -411,9 +440,10 @@ int main(int argc, char ** argv)
,pattern[i], j); ,pattern[i], j);
printf("%.*s\n",size_pattern,&buf[j]); printf("%.*s\n",size_pattern,&buf[j]);
} }
*/
n_matches[i] += result_vec[j]; n_matches[i] += result_vec[j];
} }
*/
offset += (MAX_BUFFER_SIZE - size_pattern + 1); offset += (MAX_BUFFER_SIZE - size_pattern + 1);
...@@ -433,7 +463,8 @@ int main(int argc, char ** argv) ...@@ -433,7 +463,8 @@ int main(int argc, char ** argv)
, cudaGetErrorString(cu_err)); , cudaGetErrorString(cu_err));
return 1; return 1;
} }
/*
cudaFree(result_vec_dev); cudaFree(result_vec_dev);
if ((cu_err = cudaGetLastError()) != cudaSuccess) if ((cu_err = cudaGetLastError()) != cudaSuccess)
{ {
...@@ -443,6 +474,7 @@ int main(int argc, char ** argv) ...@@ -443,6 +474,7 @@ int main(int argc, char ** argv)
} }
free(result_vec); free(result_vec);
*/
} }
//free memory - and then get onto the next pattern. //free memory - and then get onto the next pattern.
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter