diff --git a/Projet/CODE/apm/bench.sh b/Projet/CODE/apm/bench.sh index 49f4988fcaad5d3525348c7dc1681c05ebac8c05..6c18c910c41f3c9d226c9d390cf375fe1e19a89a 100755 --- a/Projet/CODE/apm/bench.sh +++ b/Projet/CODE/apm/bench.sh @@ -2,9 +2,13 @@ files=("dna/vicPac2_1.fa" "dna/vicPac2_2.fa" "dna/vicPac2_3.fa") -matches=("12345" "1234567890" "12345678901234567890") +matches=("12345" "1234567890" "12345678901234567890" "123456789012345678901234567890") runs=("apm" "apm_omp" "apm_gpu") -timeout=3 +timeout=30 + +timeout_file="timeout.txt" +rm -f "${timeout_file}" +touch "${timeout_file}" echo -n "# test iteration, file_size (o), match_size (o), version," >&2 echo " time (s) (timeout = ${timeout})" >&2 @@ -18,15 +22,36 @@ do for exe in ${runs[@]} do ( - echo -n "${i}, $(stat -c %s ${f}), ${#m}, ${exe}, " - ( - timeout --foreground ${timeout} ./${exe} 3 ${f} ${m} - if [ ${?} -eq 124 ] + fs=$(stat -c %s ${f}) + ms=${#m} + do_time_out=0 + while read -r line + do + texe=$(echo ${line} | cut -d '|' -f 1) + tfs=$(echo ${line} | cut -d '|' -f 2) + tms=$(echo ${line} | cut -d '|' -f 3) + if [ ${texe} = ${exe} -a ${tfs} -le ${fs} -a ${tms} -le ${ms} ] then - echo "timeout" + do_time_out=$[${do_time_out} + 1] fi - ) \ - | sed -n "s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p" + done < "${timeout_file}" + + echo -n "${i}, ${fs}, ${ms}, ${exe}, " + if [ ${do_time_out} -gt 0 ] + then + echo "speculative timeout" + else + ( + timeout --foreground ${timeout} ./${exe} 3 ${f} ${m} + if [ ${?} -eq 124 ] + then + echo "timeout" + echo "${exe}|${fs}|${ms}" >> "${timeout_file}" + fi + ) \ + | sed -n \ + "s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p" + fi ) 2>&1 done done diff --git a/Projet/CODE/apm/src/apm_gpu.cu b/Projet/CODE/apm/src/apm_gpu.cu index ce4656c7ecdd420128eb326c3da41e6e5414c686..b059087fdbc6cfaa69606a1a2a73439e78d148b3 100644 --- a/Projet/CODE/apm/src/apm_gpu.cu +++ b/Projet/CODE/apm/src/apm_gpu.cu @@ -170,8 +170,8 @@ levenshtein(char *s1, char *s2, int len, int *column, int approx_factor) } __global__ void -levenshtein_cu(char *find, char *buf, int len, int n_bytes - , int approx_factor, int *g_column, int *result) +levenshtein_cu(char *find, char *buf, int len, int n_bytes, + int approx_factor, int *result) { int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id @@ -183,7 +183,9 @@ levenshtein_cu(char *find, char *buf, int len, int n_bytes //position s2 and column to the right position in the pre-allocated //arrays char *s2 = buf + tId; - int *column = g_column + tId * (len + 1); + //int *column = g_column + tId * (len + 1); + extern __shared__ int g_column[]; + int *column = g_column + threadIdx.x * (len + 1); int x, y, lastdiag, olddiag; @@ -234,7 +236,7 @@ main(int argc, char **argv) int *n_matches; //cuda-related vars char *buf_dev; - int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks + //int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks int NB = 0;//Number of blocks /* Check number of arguments */ @@ -334,9 +336,20 @@ main(int argc, char **argv) cudaMalloc((void **)&result_dev, sizeof(int)); __cudaCheckErrors("Unable to allocate result on device"); + int maxsharedmem; + cudaDeviceGetAttribute(&maxsharedmem, cudaDevAttrMaxSharedMemoryPerBlock, 0); + int NTBB = MIN3(maxsharedmem / ((size_pattern + 1) * sizeof(int)), + NUMBER_THREADS_BY_BLOCK, NUMBER_THREADS_BY_BLOCK); + int Nshared = (size_pattern + 1) * NTBB * sizeof(int); - off_t offset = 0; +#if APM_DEBUG + printf("NTBB: %d\n", NTBB); + printf("maxsharedmem:%d\n", maxsharedmem); + printf("shared per block: %ld\n", (size_pattern + 1) * sizeof(int)); + printf("Shard memory: %d\n", Nshared); +#endif + off_t offset = 0; while (offset < filesize) { #if APM_DEBUG @@ -355,13 +368,7 @@ main(int argc, char **argv) cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice); __cudaCheckErrors("Unable to copy buffer onto device"); - NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0); - int *column_dev; - cudaMalloc((void **)&column_dev, - (size_pattern + 1)*NTBB * NB * sizeof(int)); - __cudaCheckErrors("Unable to allocate column vector on device"); - char *result_vec_dev; //result vectors. cudaMalloc((void **)&result_vec_dev, NTBB * NB * sizeof(char)); @@ -375,11 +382,11 @@ main(int argc, char **argv) return 1; } - levenshtein_cu <<< NB, NTBB>>>(pattern_dev, buf_dev, size_pattern, - n_bytes, approx_factor, column_dev, result_dev); + levenshtein_cu <<< NB, NTBB, Nshared>>>( + pattern_dev, buf_dev, size_pattern, + n_bytes, approx_factor, result_dev); __cudaCheckErrors("Kernel execution of levenshtein_cu failed"); - //get result cudaMemcpy(&result, result_dev, sizeof(int), cudaMemcpyDeviceToHost); @@ -390,12 +397,6 @@ main(int argc, char **argv) offset += (MAX_BUFFER_SIZE - size_pattern + 1); cudaFree(buf_dev); __cudaCheckErrors("Unable to free memory for dev on device"); - - - free(buf); - cudaFree(column_dev); - __cudaCheckErrors("Unable to free memory for column on device"); - } //free memory - and then get onto the next pattern.