added shared memory support to GPU and fixed GPU memory limit on long match strings

132b2166 · Nicolas MARIE · baa9a9ab · 132b2166 · 132b2166
--- a/Projet/CODE/apm/bench.sh
+++ b/Projet/CODE/apm/bench.sh
@@ -2,9 +2,13 @@
 files=("dna/vicPac2_1.fa" "dna/vicPac2_2.fa" "dna/vicPac2_3.fa")
-matches=("12345" "1234567890" "12345678901234567890")
+matches=("12345" "1234567890" "12345678901234567890" "123456789012345678901234567890")
 runs=("apm" "apm_omp" "apm_gpu")
-timeout=3
+timeout=30
+timeout_file="timeout.txt"
+rm -f "${timeout_file}"
+touch "${timeout_file}"
 echo -n "# test iteration, file_size (o), match_size (o), version," >&2
 echo " time (s) (timeout = ${timeout})" >&2
@@ -18,15 +22,36 @@ do
 			for exe in ${runs[@]}
 			do
 				(
-					echo -n "${i}, $(stat -c %s ${f}), ${#m}, ${exe}, "
+					fs=$(stat -c %s ${f})
+					ms=${#m}
+					do_time_out=0
+					while read -r line
+					do
+						texe=$(echo ${line} | cut -d '|' -f 1)
+						tfs=$(echo ${line} | cut -d '|' -f 2)
+						tms=$(echo ${line} | cut -d '|' -f 3)
+						if [ ${texe} = ${exe} -a ${tfs} -le ${fs} -a ${tms} -le ${ms} ]
+						then
+							do_time_out=$[${do_time_out} + 1]
+						fi
+					done < "${timeout_file}"
+					echo -n "${i}, ${fs}, ${ms}, ${exe}, "
+					if [ ${do_time_out} -gt 0 ]
+					then
+						echo "speculative timeout"
+					else
 						(
 							timeout --foreground ${timeout} ./${exe} 3 ${f} ${m}
 							if [ ${?} -eq 124 ]
 							then
 								echo "timeout"
+								echo "${exe}|${fs}|${ms}" >> "${timeout_file}"
 							fi
 						) \
-					| sed -n "s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p"
+						| sed -n \
+						"s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p"
+					fi
 				) 2>&1
 			done
 		done

--- a/Projet/CODE/apm/src/apm_gpu.cu
+++ b/Projet/CODE/apm/src/apm_gpu.cu
@@ -170,8 +170,8 @@ levenshtein(char *s1, char *s2, int len, int *column, int approx_factor)
 }
 __global__ void
-levenshtein_cu(char *find, char *buf, int len, int n_bytes
+levenshtein_cu(char *find, char *buf, int len, int n_bytes,
-        , int approx_factor, int *g_column, int *result)
+        int approx_factor, int *result)
 {
 	int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id
@@ -183,7 +183,9 @@ levenshtein_cu(char *find, char *buf, int len, int n_bytes
 	//position s2 and column to the right position in the pre-allocated
 	//arrays
 	char *s2 = buf + tId;
-	int *column = g_column + tId * (len + 1);
+	//int *column = g_column + tId * (len + 1);
+	extern __shared__ int g_column[];
+	int *column = g_column + threadIdx.x * (len + 1);
 	int x, y, lastdiag, olddiag;
@@ -234,7 +236,7 @@ main(int argc, char **argv)
 	int *n_matches;
 	//cuda-related vars
 	char *buf_dev;
-	int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks
+	//int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks
 	int NB = 0;//Number of blocks
 	/* Check number of arguments */
@@ -334,9 +336,20 @@ main(int argc, char **argv)
 		cudaMalloc((void **)&result_dev, sizeof(int));
 		__cudaCheckErrors("Unable to allocate result on device");
+		int maxsharedmem;
+		cudaDeviceGetAttribute(&maxsharedmem, cudaDevAttrMaxSharedMemoryPerBlock, 0);
+		int NTBB = MIN3(maxsharedmem / ((size_pattern + 1) * sizeof(int)),
+				NUMBER_THREADS_BY_BLOCK, NUMBER_THREADS_BY_BLOCK);
+		int Nshared = (size_pattern + 1) * NTBB * sizeof(int);
-		off_t offset = 0;
+#if APM_DEBUG
+		printf("NTBB: %d\n", NTBB);
+		printf("maxsharedmem:%d\n", maxsharedmem);
+		printf("shared per block: %ld\n", (size_pattern + 1) * sizeof(int));
+		printf("Shard memory: %d\n", Nshared);
+#endif
+		off_t offset = 0;
 		while (offset < filesize)
 		{
 #if APM_DEBUG
@@ -355,13 +368,7 @@ main(int argc, char **argv)
 			cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice);
 			__cudaCheckErrors("Unable to copy buffer onto device");
 			NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0);
-			int *column_dev;
-			cudaMalloc((void **)&column_dev,
-			        (size_pattern + 1)*NTBB * NB * sizeof(int));
-			__cudaCheckErrors("Unable to allocate column vector on device");
 			char *result_vec_dev; //result vectors.
 			cudaMalloc((void **)&result_vec_dev, NTBB * NB * sizeof(char));
@@ -375,11 +382,11 @@ main(int argc, char **argv)
 				return 1;
 			}
-			levenshtein_cu <<< NB, NTBB>>>(pattern_dev, buf_dev, size_pattern,
+			levenshtein_cu <<< NB, NTBB, Nshared>>>(
-			        n_bytes, approx_factor, column_dev, result_dev);
+					pattern_dev, buf_dev, size_pattern,
+			        n_bytes, approx_factor, result_dev);
 			__cudaCheckErrors("Kernel execution of levenshtein_cu failed");
 			//get result
 			cudaMemcpy(&result, result_dev,
 			        sizeof(int), cudaMemcpyDeviceToHost);
@@ -390,12 +397,6 @@ main(int argc, char **argv)
 			offset += (MAX_BUFFER_SIZE - size_pattern + 1);
 			cudaFree(buf_dev);
 			__cudaCheckErrors("Unable to free memory for dev on device");
-			free(buf);
-			cudaFree(column_dev);
-			__cudaCheckErrors("Unable to free memory for column on device");
 		}
 		//free memory - and then get onto the next pattern.