diff --git a/Projet/CODE/apm/src/apm_gpu.cu b/Projet/CODE/apm/src/apm_gpu.cu index 9526660230da152ca1de0dfcb76e364b8653cd69..344d55ea25cb2edbdfb04a0f5dcfec33cb3e948d 100644 --- a/Projet/CODE/apm/src/apm_gpu.cu +++ b/Projet/CODE/apm/src/apm_gpu.cu @@ -11,6 +11,8 @@ #include <sys/time.h> #define APM_DEBUG 0 +#define NUMBER_THREADS_BY_BLOCK 1024 +#define MAX_BUFFER_SIZE 1024000 //102.4M file at most char * read_input_file(char * filename, int * size) { @@ -32,9 +34,9 @@ char * read_input_file(char * filename, int * size) lseek(fd, 0, SEEK_SET); /* TODO check return of lseek */ -#if APM_DEBUG + #if APM_DEBUG printf("File length: %lld\n", fsize); -#endif + #endif /* Allocate data to copy the target text */ buf = (char *)malloc(fsize * sizeof (char)); @@ -54,15 +56,67 @@ char * read_input_file(char * filename, int * size) return NULL; } -#if APM_DEBUG + #if APM_DEBUG printf("Number of read bytes: %d\n", n_bytes); -#endif + #endif *size = n_bytes; close(fd); return buf; } +off_t get_file_size(int fd) +{ + off_t size; + size = lseek(fd, 0, SEEK_END); + lseek(fd, 0, SEEK_SET); + return size; +} +char * read_input_file_max(int fd, int * size + , int pattern_len, off_t offset) +{ + char * buf; + off_t fsize; + int n_bytes = 1; + int to_read; + + fsize = lseek(fd, 0, SEEK_END); + to_read = + ((fsize - offset) > MAX_BUFFER_SIZE) ? MAX_BUFFER_SIZE : (fsize-offset); + lseek(fd, offset, SEEK_SET); + + #if APM_DEBUG + printf("File length: %lld\n", fsize); + printf("diff: %i\n", to_read); + #endif + + /* Allocate data to copy the target text */ + buf = (char *)malloc(fsize * sizeof (char)); + if (buf == NULL) + { + fprintf(stderr, "Unable to allocate %ld byte(s) for main array\n", + fsize); + return NULL; + } + + n_bytes = read(fd, buf, to_read); + if (n_bytes != to_read) + { + fprintf(stderr, + "Unable to copy %ld byte(s) from text file (%d byte(s) copied)\n", + fsize, n_bytes); + return NULL; + } + + #if APM_DEBUG + printf("Number of read bytes: %d\n", n_bytes); + #endif + + *size = n_bytes; + return buf; +} + + #define MIN3(a, b, c) ((a)<(b) ? ((a)<(c) ? (a) : (c)) : ((b)<(c) ? (b) : (c))) int levenshtein(char *s1, char *s2, int len, int * column, int approx_factor) @@ -130,6 +184,8 @@ int main(int argc, char ** argv) { char ** pattern; char * filename; + int fdesc; + off_t filesize; int approx_factor = 0; int nb_patterns = 0; @@ -141,11 +197,11 @@ int main(int argc, char ** argv) //cuda-related vars char *buf_dev; - int NTBB = 1024; //Number of threads by blocks - int NB;//Number of blocks + int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks + int NB = 0;//Number of blocks cudaError_t cu_err; - - /* Check number of arguments */ + + /* Check number of arguments */ if (argc < 4) { printf("Usage: %s approximation_factor " @@ -156,7 +212,23 @@ int main(int argc, char ** argv) approx_factor = atoi(argv[1]);/* Get the distance factor */ filename = argv[2];/* Grab the filename containing the target text */ - nb_patterns = argc - 3;/* Get the number of patterns to search for */ + + /* Open the text file */ + fdesc = open(filename, O_RDONLY); + if (fdesc == -1) + { + fprintf(stderr, "Unable to open the text file <%s>\n", filename); + return 1; + } + + filesize = get_file_size(fdesc); + if (filesize == 0) + { + fprintf(stderr, "File <%s> is empty !\n", filename); + return 1; + } + + nb_patterns = argc - 3;/* Get the number of patterns to search for */ pattern = (char **)malloc(nb_patterns * sizeof(char*)); if (pattern == NULL)/*Fill the pattern*/ @@ -193,28 +265,7 @@ int main(int argc, char ** argv) "looking for %d pattern(s) in file %s w/ distance of %d\n", nb_patterns, filename, approx_factor); - buf = read_input_file(filename, &n_bytes); - if (buf == NULL) - { - fprintf(stderr, "Error: NULL pointer from reading input file."); - return 1; - } - cudaMalloc((void**)&buf_dev, n_bytes * sizeof(char)); - if ((cu_err = cudaGetLastError()) != cudaSuccess) - { - fprintf(stderr, "Unable to allocate buffer on device: %s.\n" - , cudaGetErrorString(cu_err)); - return 1; - } - - cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice); - if ((cu_err = cudaGetLastError()) != cudaSuccess) - { - fprintf(stderr, "Unable to copy buffer onto device: %s.\n" - , cudaGetErrorString(cu_err)); - return 1; - } n_matches = (int *)malloc(nb_patterns * sizeof(int));/*Alloc the matches*/ if (n_matches == NULL) @@ -234,45 +285,73 @@ int main(int argc, char ** argv) for (int i = 0; i < nb_patterns; i++) { - n_matches[i] = 0; - - int size_pattern = strlen(pattern[i]); - char* pattern_dev; + n_matches[i] = 0; + + int size_pattern = strlen(pattern[i]); + char* pattern_dev; + + cudaMalloc((void**)&pattern_dev, size_pattern); + if ((cu_err = cudaGetLastError()) != cudaSuccess) + { + fprintf(stderr, "Unable to allocate pattern on device: %s.\n" + , cudaGetErrorString(cu_err)); + return 1; + } + + cudaMemcpy(pattern_dev, pattern[i] + , size_pattern, cudaMemcpyHostToDevice); + if ((cu_err = cudaGetLastError()) != cudaSuccess) + { + fprintf(stderr, "Unable to copy pattern onto device: %s.\n" + , cudaGetErrorString(cu_err)); + return 1; + } + + + off_t offset = 0; + while (offset < filesize) + { + #if APM_DEBUG + printf("offset: %i, filesize: %i\n",offset,filesize); + #endif + + buf = read_input_file_max(fdesc, &n_bytes, size_pattern, offset); + if (buf == NULL) + { + fprintf(stderr, "Error: NULL pointer from reading input file."); + return 1; + } - cudaMalloc((void**)&pattern_dev, size_pattern); + cudaMalloc((void**)&buf_dev, n_bytes * sizeof(char)); if ((cu_err = cudaGetLastError()) != cudaSuccess) { - fprintf(stderr, "Unable to allocate pattern on device: %s.\n" - , cudaGetErrorString(cu_err)); + fprintf(stderr, "Unable to allocate buffer on device: %s.\n" + , cudaGetErrorString(cu_err)); return 1; } - - cudaMemcpy(pattern_dev, pattern[i] - , size_pattern, cudaMemcpyHostToDevice); + + cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice); if ((cu_err = cudaGetLastError()) != cudaSuccess) { - fprintf(stderr, "Unable to copy pattern onto device: %s.\n" - , cudaGetErrorString(cu_err)); + fprintf(stderr, "Unable to copy buffer onto device: %s.\n" + , cudaGetErrorString(cu_err)); return 1; } - - NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0); + NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0); + - //TODO err check int * column_dev; - cudaMalloc((void**)&column_dev, (size_pattern+1)*NTBB*NB*sizeof(int)); + cudaMalloc((void**)&column_dev + , (size_pattern+1)*NTBB*NB*sizeof(int)); if ((cu_err = cudaGetLastError()) != cudaSuccess) { fprintf(stderr, "Unable to allocate column vector on device: %s.\n" - , cudaGetErrorString(cu_err)); + , cudaGetErrorString(cu_err)); return 1; } - - - - //TODO err check + char * result_vec_dev; //result vectors. cudaMalloc((void**)&result_vec_dev, NTBB*NB*sizeof(char)); if ((cu_err = cudaGetLastError()) != cudaSuccess) @@ -281,7 +360,14 @@ int main(int argc, char ** argv) , cudaGetErrorString(cu_err)); return 1; } - + + char* result_vec =(char*) malloc(NTBB*NB*sizeof(char)); + if (result_vec == NULL) + { + fprintf(stderr, "Unable to allocate result vec on host."); + return 1; + } + cudaMemset(result_vec_dev, 0, NTBB*NB*sizeof(char)); if ((cu_err = cudaGetLastError()) != cudaSuccess) { @@ -290,13 +376,6 @@ int main(int argc, char ** argv) return 1; } - char* result_vec =(char*) malloc(NTBB*NB*sizeof(char)); - if (result_vec == NULL) - { - fprintf(stderr, "Unable to allocate result vec on host."); - return 1; - } - levenshtein_cu<<<NB,NTBB>>>(pattern_dev, buf_dev, size_pattern , n_bytes, approx_factor, column_dev, result_vec_dev); @@ -306,7 +385,7 @@ int main(int argc, char ** argv) , cudaGetErrorString(cu_err)); return 1; } - + //get result cudaMemcpy(result_vec, result_vec_dev , NTBB*NB*sizeof(char), cudaMemcpyDeviceToHost); @@ -316,14 +395,15 @@ int main(int argc, char ** argv) , cudaGetErrorString(cu_err)); return 1; } - + //TODO MAYBE - reduction on gpu. for (int j = 0 ; j<n_bytes ; j++) { /* Highly advanced debbugging (printfs) int column[size_pattern+1]; int d = - levenshtein(pattern[i], buf+j, size_pattern, column, approx_factor); + levenshtein(pattern[i], buf+j, size_pattern, column + , approx_factor); //printf("%d",d); if (d != result_vec[j]) { @@ -333,17 +413,19 @@ int main(int argc, char ** argv) } */ n_matches[i] += result_vec[j]; - } + } + + offset += (MAX_BUFFER_SIZE - size_pattern + 1); - //free memory - and then get onto the next pattern. - cudaFree(pattern_dev); + cudaFree(buf_dev); if ((cu_err = cudaGetLastError()) != cudaSuccess) { - fprintf(stderr, "Unable to free memory for pattern on device: %s.\n" + fprintf(stderr, "Unable to free memory for dev on device: %s.\n" , cudaGetErrorString(cu_err)); return 1; } - + free(buf); + cudaFree(column_dev); if ((cu_err = cudaGetLastError()) != cudaSuccess) { @@ -361,6 +443,16 @@ int main(int argc, char ** argv) } free(result_vec); + } + + //free memory - and then get onto the next pattern. + cudaFree(pattern_dev); + if ((cu_err = cudaGetLastError()) != cudaSuccess) + { + fprintf(stderr, "Unable to free memory for pattern on device: %s.\n" + , cudaGetErrorString(cu_err)); + return 1; + } } /* Timer stop */ diff --git a/Projet/CODE/apm/src/apm_omp.c b/Projet/CODE/apm/src/apm_omp.c index 3162e396f16e1e7c4af458cc8c88b0770475705f..853c628cb2318bfca1542ebd10f24c931a8487ac 100644 --- a/Projet/CODE/apm/src/apm_omp.c +++ b/Projet/CODE/apm/src/apm_omp.c @@ -104,6 +104,7 @@ int main(int argc, char ** argv) double duration; int n_bytes; int * n_matches; + int num_threads; /* Check number of arguments */ if (argc < 4) @@ -167,6 +168,11 @@ int main(int argc, char ** argv) nb_patterns * sizeof(int)); return 1; } + + #pragma omp parallel + { + num_threads = omp_get_num_threads(); + } /***** * BEGIN MAIN LOOP @@ -183,7 +189,7 @@ int main(int argc, char ** argv) n_matches[i] = 0; - column = (int *)malloc((size_pattern+1) * n_bytes * sizeof(int)); + column = (int *)malloc((size_pattern+1) * num_threads * sizeof(int)); if (column == NULL) { fprintf(stderr, @@ -212,7 +218,7 @@ int main(int argc, char ** argv) } distance = levenshtein(pattern[i], &buf[j], size_pattern, - column+(j*(size_pattern+1))); + column+(omp_get_thread_num()*(size_pattern+1))); if (distance <= approx_factor) {