diff --git a/Projet/CODE/apm/Makefile b/Projet/CODE/apm/Makefile index 2b3cde15d79f8095b5f8690d205ccb3eff2abcb6..02bc6ae708963cd043ab1275933f66a90f0589e1 100644 --- a/Projet/CODE/apm/Makefile +++ b/Projet/CODE/apm/Makefile @@ -24,6 +24,9 @@ apm_gpu: src/apm_gpu.cu # you need to create the file dna/test.fa test: all + ./apm 0 dna/small_chrY.fa TTGA TTGA TTGA + ./apm_omp 0 dna/small_chrY.fa TTGA TTGA TTGA + ./apm_gpu 0 dna/small_chrY.fa TTGA TTGA TTGA ./apm 0 dna/test.fa test123 ./apm_omp 0 dna/test.fa test123 ./apm_gpu 0 dna/test.fa test123 @@ -31,6 +34,7 @@ test: all ./apm_omp 0 dna/xenoRefMrna.fa ttggaa ./apm_gpu 0 dna/xenoRefMrna.fa ttggaa + .PHONY: clean clean: rm -rf apm apm_omp apm_gpu diff --git a/Projet/CODE/apm/src/apm.c b/Projet/CODE/apm/src/apm.c index 802e016712c498e6804d2c5ccda1b566ddd0c1a7..1847f6b6767078fd412c82964f1d02725bd70d15 100644 --- a/Projet/CODE/apm/src/apm.c +++ b/Projet/CODE/apm/src/apm.c @@ -14,6 +14,11 @@ #define APM_DEBUG 0 +#define MAX_BUFFER_SIZE 1024 * 1024 * 1024 // 1Go per reads at most +//#define MAX_BUFFER_SIZE 512 // debug + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MIN3(a, b, c) ((a)<(b) ? ((a)<(c) ? (a) : (c)) : ((b)<(c) ? (b) : (c))) // can't be inline as it is a variadic function void @@ -31,55 +36,72 @@ __checkErrors(const int isBad, const char *fmt, ...) } } +struct openfile +{ + int fd; // file descriptor + size_t size; // size of the file + size_t buf_start; // index of the start of the buffer in the file + char *buf; // buffer being read + unsigned int buf_alloc_size; // allocateed size of the buffer + unsigned int buf_size; // size of the datas in the buffer +}; -char * -read_input_file(char *filename, size_t *size) +void +open_input_file(struct openfile *file, char *filename) { - char *buf; - struct stat fs; - size_t fsize; - int fd = 0; - size_t read_bytes = 0; - size_t total_bytes = 0; - /* Open the text file */ - fd = open(filename, O_RDONLY); - __checkErrors(fd == -1, "Unable to open the text file <%s>\n", filename); + // openning the file + file->fd = open(filename, O_RDONLY); + __checkErrors(file->fd < 0, "Unable to open the text file <%s>\n", filename); - /* Get the number of characters in the textfile */ - int stat = fstat(fd, &fs); - __checkErrors(stat < 0, "Stat of file <%s> failed\n", filename); - fsize = fs.st_size; + // size of the file + struct stat fs; + int res = fstat(file->fd, &fs); + __checkErrors(res < 0, "Stat of file <%s> failed\n", filename); + file->size = fs.st_size; + + // buffer allocation + file->buf_alloc_size = MIN(file->size, MAX_BUFFER_SIZE); + file->buf = (char *) malloc(file->buf_alloc_size * sizeof(char)); + __checkErrors(file->buf == NULL, + "Unable to allocate %ld byte(s) for main array\n", + file->buf_alloc_size); + + file->buf_start = 0; + file->buf_size = 0; +} -#if APM_DEBUG - printf("File length: %ld\n", fsize); -#endif +int +read_input_file(struct openfile *file, size_t recovering) +{ + // how mutch do we realy go back + size_t go_back = MIN(file->buf_start, recovering); - /* Allocate data to copy the target text */ - buf = malloc(fsize * sizeof(char)); - __checkErrors(buf == NULL, - "Unable to allocate %ld byte(s) for main array\n", fsize); + // we read the min between the max size of the buffer + // and what is left in the file + int to_read = MIN(file->buf_alloc_size - go_back, + file->size - file->buf_start); - do - { - read_bytes = read(fd, buf, fsize - total_bytes); - total_bytes += read_bytes; - } - while (read_bytes != 0); - - __checkErrors(total_bytes != fsize, "Unable to copy %ld byte(s) " - "from text file (%ld byte(s) copied)\n", fsize, total_bytes); + // we copy the recovery from the end of the buffer + memcpy(file->buf, file->buf + file->buf_size - go_back, go_back); -#if APM_DEBUG - printf("Number of read bytes: %ld\n", total_bytes); -#endif + // we read the file + int read_size = read(file->fd, file->buf + go_back, to_read); + __checkErrors(read_size != to_read, "Unable to copy %ld byte(s) " + "from text file (%ld byte(s) copied)\n", to_read, read_size); + file->buf_start += read_size; + file->buf_size = go_back + read_size; - *size = total_bytes; - close(fd); - return buf; + return read_size; } -#define MIN3(a, b, c) ((a)<(b) ? ((a)<(c) ? (a) : (c)) : ((b)<(c) ? (b) : (c))) +void +close_input_file(struct openfile *file) +{ + int res = close(file->fd); + __checkErrors(res < 0, "Faile to close file descriptor (%d)\n.", file->fd); + free(file->buf); +} int levenshtein(char *s1, char *s2, unsigned int len, unsigned int *column) @@ -113,65 +135,77 @@ levenshtein(char *s1, char *s2, unsigned int len, unsigned int *column) int main(int argc, char **argv) { - char **pattern; - char *filename; + // arguemnts int approx_factor = 0; - size_t nb_patterns = 0; - size_t i, j; - char *buf; + char *filename; + int nb_patterns = 0; + char **pattern; + + // time interval and duration for time measurement struct timeval t1, t2; double duration; - size_t n_bytes; - size_t *n_matches; - /* Check number of arguments */ - if (argc < 4) - { - printf("Usage: %s approximation_factor " - "dna_database pattern1 pattern2 ...\n", - argv[0]); - return 1; - } + // result matches + int *n_matches; - approx_factor = atoi(argv[1]);/* Get the distance factor */ - filename = argv[2];/* Grab the filename containing the target text */ - nb_patterns = argc - 3;/* Get the number of patterns to search for */ - pattern = malloc(nb_patterns * sizeof(char *)); - __checkErrors(pattern == NULL, "Unable to allocate array " - "of pattern of size %ld\n", nb_patterns); + // file + struct openfile file; - for (i = 0; i < nb_patterns; i++) /* Grab the patterns */ + // Arguments parsing { - int l; - l = strlen(argv[i + 3]); - __checkErrors(l <= 0, "Error while parsing argument %ld\n", i + 3); + if (argc < 4) + { + printf("Usage: %s approximation_factor " + "dna_database pattern1 pattern2 ...\n", + argv[0]); + return 1; + } - pattern[i] = (char *)malloc((l + 1) * sizeof(char)); - __checkErrors(pattern[i] == NULL, "Unable to allocate string " - "of size %d\n", l); + // reading argument + approx_factor = atoi(argv[1]);/* Get the distance factor */ + filename = argv[2];/* Grab the filename containing the target text */ + nb_patterns = argc - 3;/* Get the number of patterns to search for */ + pattern = malloc(nb_patterns * sizeof(char *)); + __checkErrors(pattern == NULL, "Unable to allocate array " + "of pattern of size %ld\n", nb_patterns); + + // copying patterns + for (int i = 0; i < nb_patterns; i++) /* Grab the patterns */ + { + int l = strlen(argv[i + 3]); + __checkErrors(l <= 0, "Error while parsing argument %ld\n", i + 3); + + pattern[i] = (char *) malloc((l + 1) * sizeof(char)); + __checkErrors(pattern[i] == NULL, "Unable to allocate string " + "of size %d\n", l); + + strncpy(pattern[i], argv[i + 3], (l + 1)); + } + } - strncpy(pattern[i], argv[i + 3], (l + 1)); + // results allocations + { + n_matches = malloc(nb_patterns * sizeof(size_t)); + __checkErrors(n_matches == NULL, "Error: unable to " + "allocate memory for %ldB\n", nb_patterns * sizeof(int)); } printf("Approximate Pattern Mathing: " - "looking for %ld pattern(s) in file %s w/ distance of %d\n", + "looking for %d pattern(s) in file %s w/ distance of %d\n", nb_patterns, filename, approx_factor); - buf = read_input_file(filename, &n_bytes); - __checkErrors(buf == NULL, "Error: NULL pointer from reading input file."); - - n_matches = malloc(nb_patterns * sizeof(size_t));/*Alloc the matches*/ - __checkErrors(n_matches == NULL, "Error: unable to " - "allocate memory for %ldB\n", nb_patterns * sizeof(int)); - /***** * BEGIN MAIN LOOP ******/ + /* Timer start */ gettimeofday(&t1, NULL); - for (i = 0; i < nb_patterns; i++) + for (int i = 0; i < nb_patterns; i++) { + // file initialisation + open_input_file(&file, filename); + size_t size_pattern = strlen(pattern[i]); unsigned int *column; n_matches[i] = 0; @@ -180,27 +214,31 @@ main(int argc, char **argv) "allocate memory for column (%ldB)\n", (size_pattern + 1) * sizeof(unsigned int)); - for (j = 0; j < n_bytes - size_pattern; j++) + while (read_input_file(&file, size_pattern)) { - int distance = 0; + for (unsigned int j = 0; j < file.buf_size - size_pattern; j++) + { + int distance = 0; #if APM_DEBUG - if (j % (n_bytes / 100) == 0) - { - printf("Procesing byte %ld (out of %ld)(%ld%%)\n", - j, n_bytes, j / (n_bytes / 100)); - } + if (j % (file.size / 100) == 0) + { + printf("Procesing byte %d (out of %ld)(%ld%%)\n", + j, file.size, j / (file.size / 100)); + } #endif + distance = levenshtein(pattern[i], &file.buf[j], + size_pattern, column); - distance = levenshtein(pattern[i], &buf[j], size_pattern, column); - - if (distance <= approx_factor) - { - n_matches[i]++; + if (distance <= approx_factor) + { + n_matches[i]++; + } } } + close_input_file(&file); free(column); } @@ -213,9 +251,9 @@ main(int argc, char **argv) * END MAIN LOOP ******/ - for (i = 0; i < nb_patterns; i++) + for (int i = 0; i < nb_patterns; i++) { - printf("Number of matches for pattern <%s>: %ld\n", + printf("Number of matches for pattern <%s>: %d\n", pattern[i], n_matches[i]); } diff --git a/Projet/CODE/apm/src/apm_omp.c b/Projet/CODE/apm/src/apm_omp.c index 4cc6499fadbf020d53215653c18b25df513a03f2..0893108d9275618a1cb3211e483a526ab4902b30 100644 --- a/Projet/CODE/apm/src/apm_omp.c +++ b/Projet/CODE/apm/src/apm_omp.c @@ -15,6 +15,11 @@ #include "omp.h" #define APM_DEBUG 0 +#define MAX_BUFFER_SIZE 1024 * 1024 * 1024 // 1Go per reads at most +//#define MAX_BUFFER_SIZE 512 // debug + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MIN3(a, b, c) ((a)<(b) ? ((a)<(c) ? (a) : (c)) : ((b)<(c) ? (b) : (c))) // can't be inline as it is a variadic function void @@ -32,55 +37,72 @@ __checkErrors(const int isBad, const char *fmt, ...) } } +struct openfile +{ + int fd; // file descriptor + size_t size; // size of the file + size_t buf_start; // index of the start of the buffer in the file + char *buf; // buffer being read + unsigned int buf_alloc_size; // allocateed size of the buffer + unsigned int buf_size; // size of the datas in the buffer +}; -char * -read_input_file(char *filename, size_t *size) +void +open_input_file(struct openfile *file, char *filename) { - char *buf; - struct stat fs; - size_t fsize; - int fd = 0; - size_t read_bytes = 0; - size_t total_bytes = 0; - /* Open the text file */ - fd = open(filename, O_RDONLY); - __checkErrors(fd == -1, "Unable to open the text file <%s>\n", filename); + // openning the file + file->fd = open(filename, O_RDONLY); + __checkErrors(file->fd < 0, "Unable to open the text file <%s>\n", filename); - /* Get the number of characters in the textfile */ - int stat = fstat(fd, &fs); - __checkErrors(stat < 0, "Stat of file <%s> failed\n", filename); - fsize = fs.st_size; + // size of the file + struct stat fs; + int res = fstat(file->fd, &fs); + __checkErrors(res < 0, "Stat of file <%s> failed\n", filename); + file->size = fs.st_size; + + // buffer allocation + file->buf_alloc_size = MIN(file->size, MAX_BUFFER_SIZE); + file->buf = (char *) malloc(file->buf_alloc_size * sizeof(char)); + __checkErrors(file->buf == NULL, + "Unable to allocate %ld byte(s) for main array\n", + file->buf_alloc_size); + + file->buf_start = 0; + file->buf_size = 0; +} -#if APM_DEBUG - printf("File length: %ld\n", fsize); -#endif +int +read_input_file(struct openfile *file, size_t recovering) +{ + // how mutch do we realy go back + size_t go_back = MIN(file->buf_start, recovering); - /* Allocate data to copy the target text */ - buf = malloc(fsize * sizeof(char)); - __checkErrors(buf == NULL, - "Unable to allocate %ld byte(s) for main array\n", fsize); + // we read the min between the max size of the buffer + // and what is left in the file + int to_read = MIN(file->buf_alloc_size - go_back, + file->size - file->buf_start); - do - { - read_bytes = read(fd, buf, fsize - total_bytes); - total_bytes += read_bytes; - } - while (read_bytes != 0); + // we copy the recovery from the end of the buffer + memcpy(file->buf, file->buf + file->buf_size - go_back, go_back); - __checkErrors(total_bytes != fsize, "Unable to copy %ld byte(s) " - "from text file (%ld byte(s) copied)\n", fsize, total_bytes); + // we read the file + int read_size = read(file->fd, file->buf + go_back, to_read); + __checkErrors(read_size != to_read, "Unable to copy %ld byte(s) " + "from text file (%ld byte(s) copied)\n", to_read, read_size); + file->buf_start += read_size; + file->buf_size = go_back + read_size; -#if APM_DEBUG - printf("Number of read bytes: %ld\n", total_bytes); -#endif - - *size = total_bytes; - close(fd); - return buf; + return read_size; } -#define MIN3(a, b, c) ((a)<(b) ? ((a)<(c) ? (a) : (c)) : ((b)<(c) ? (b) : (c))) +void +close_input_file(struct openfile *file) +{ + int res = close(file->fd); + __checkErrors(res < 0, "Faile to close file descriptor (%d)\n.", file->fd); + free(file->buf); +} int levenshtein(char *s1, char *s2, unsigned int len, unsigned int *column) @@ -114,105 +136,122 @@ levenshtein(char *s1, char *s2, unsigned int len, unsigned int *column) int main(int argc, char **argv) { - char **pattern; - char *filename; + // arguemnts int approx_factor = 0; - size_t nb_patterns = 0; - size_t i, j; - char *buf; + char *filename; + int nb_patterns = 0; + char **pattern; + + // time interval and duration for time measurement struct timeval t1, t2; double duration; - size_t n_bytes; - size_t *n_matches; - /* Check number of arguments */ - if (argc < 4) - { - printf("Usage: %s approximation_factor " - "dna_database pattern1 pattern2 ...\n", - argv[0]); - return 1; - } + // result matches + int *n_matches; - approx_factor = atoi(argv[1]);/* Get the distance factor */ - filename = argv[2];/* Grab the filename containing the target text */ - nb_patterns = argc - 3;/* Get the number of patterns to search for */ - pattern = malloc(nb_patterns * sizeof(char *)); - __checkErrors(pattern == NULL, "Unable to allocate array " - "of pattern of size %ld\n", nb_patterns); + // file + struct openfile file; - for (i = 0; i < nb_patterns; i++) /* Grab the patterns */ + // Arguments parsing { - int l; - l = strlen(argv[i + 3]); - __checkErrors(l <= 0, "Error while parsing argument %ld\n", i + 3); + if (argc < 4) + { + printf("Usage: %s approximation_factor " + "dna_database pattern1 pattern2 ...\n", + argv[0]); + return 1; + } + + // reading argument + approx_factor = atoi(argv[1]);/* Get the distance factor */ + filename = argv[2];/* Grab the filename containing the target text */ + nb_patterns = argc - 3;/* Get the number of patterns to search for */ + pattern = malloc(nb_patterns * sizeof(char *)); + __checkErrors(pattern == NULL, "Unable to allocate array " + "of pattern of size %ld\n", nb_patterns); + + // copying patterns + for (int i = 0; i < nb_patterns; i++) /* Grab the patterns */ + { + int l = strlen(argv[i + 3]); + __checkErrors(l <= 0, "Error while parsing argument %ld\n", i + 3); + + pattern[i] = (char *) malloc((l + 1) * sizeof(char)); + __checkErrors(pattern[i] == NULL, "Unable to allocate string " + "of size %d\n", l); - pattern[i] = (char *)malloc((l + 1) * sizeof(char)); - __checkErrors(pattern[i] == NULL, "Unable to allocate string " - "of size %d\n", l); + strncpy(pattern[i], argv[i + 3], (l + 1)); + } + } - strncpy(pattern[i], argv[i + 3], (l + 1)); + // results allocations + { + n_matches = malloc(nb_patterns * sizeof(size_t)); + __checkErrors(n_matches == NULL, "Error: unable to " + "allocate memory for %ldB\n", nb_patterns * sizeof(int)); } printf("Approximate Pattern Mathing: " - "looking for %ld pattern(s) in file %s w/ distance of %d\n", + "looking for %d pattern(s) in file %s w/ distance of %d\n", nb_patterns, filename, approx_factor); - buf = read_input_file(filename, &n_bytes); - __checkErrors(buf == NULL, "Error: NULL pointer from reading input file."); - - n_matches = malloc(nb_patterns * sizeof(size_t));/*Alloc the matches*/ - __checkErrors(n_matches == NULL, "Error: unable to " - "allocate memory for %ldB\n", nb_patterns * sizeof(int)); - /***** * BEGIN MAIN LOOP ******/ + /* Timer start */ gettimeofday(&t1, NULL); - for (i = 0; i < nb_patterns; i++) + for (int i = 0; i < nb_patterns; i++) { + // file initialisation + open_input_file(&file, filename); + size_t size_pattern = strlen(pattern[i]); n_matches[i] = 0; - int matches = 0; - #pragma omp parallel reduction(+:matches) + while (read_input_file(&file, size_pattern)) { - unsigned int *column; - column = malloc((size_pattern + 1) * sizeof(unsigned int)); - __checkErrors(column == NULL, - "Error: unable to allocate memory for column (%ldB)\n", - (size_pattern + 1) * sizeof(unsigned int)); + int matches = 0; + #pragma omp parallel reduction(+:matches) + { + unsigned int *column; + column = malloc((size_pattern + 1) * sizeof(unsigned int)); + __checkErrors(column == NULL, + "Error: unable to allocate memory for column (%ldB)\n", + (size_pattern + 1) * sizeof(unsigned int)); - #pragma omp for schedule(dynamic, 40960) + #pragma omp for schedule(dynamic, 40960) - for (j = 0; j < n_bytes - size_pattern; j++) - { - int distance = 0; + for (unsigned int j = 0; j < file.buf_size - size_pattern; j++) + { + int distance = 0; #if APM_DEBUG - if (j % (n_bytes / 100) == 0) - { - printf("Procesing byte %ld (out of %ld)(%ld%%)\n", - j, n_bytes, j / (n_bytes / 100)); - } + if (j % (file.size / 100) == 0) + { + printf("Procesing byte %d (out of %ld)(%ld%%)\n", + j, file.size, j / (file.size / 100)); + } #endif - distance = levenshtein(pattern[i], &buf[j], - size_pattern, column); + distance = levenshtein(pattern[i], &file.buf[j], + size_pattern, column); - if (distance <= approx_factor) - { - matches++; + if (distance <= approx_factor) + { + matches++; + } } - } - free(column); + free(column); + } + n_matches[i] += matches; } - n_matches[i] = matches; + + close_input_file(&file); } /* Timer stop */ @@ -224,9 +263,9 @@ main(int argc, char **argv) * END MAIN LOOP ******/ - for (i = 0; i < nb_patterns; i++) + for (int i = 0; i < nb_patterns; i++) { - printf("Number of matches for pattern <%s>: %ld\n", + printf("Number of matches for pattern <%s>: %d\n", pattern[i], n_matches[i]); }