Skip to content
Extraits de code Groupes Projets
Valider 132b2166 rédigé par Nicolas MARIE's avatar Nicolas MARIE
Parcourir les fichiers

Added shared memory support to GPU and fixed GPU memory limit on long match strings

parent baa9a9ab
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -2,9 +2,13 @@ ...@@ -2,9 +2,13 @@
files=("dna/vicPac2_1.fa" "dna/vicPac2_2.fa" "dna/vicPac2_3.fa") files=("dna/vicPac2_1.fa" "dna/vicPac2_2.fa" "dna/vicPac2_3.fa")
matches=("12345" "1234567890" "12345678901234567890") matches=("12345" "1234567890" "12345678901234567890" "123456789012345678901234567890")
runs=("apm" "apm_omp" "apm_gpu") runs=("apm" "apm_omp" "apm_gpu")
timeout=3 timeout=30
timeout_file="timeout.txt"
rm -f "${timeout_file}"
touch "${timeout_file}"
echo -n "# test iteration, file_size (o), match_size (o), version," >&2 echo -n "# test iteration, file_size (o), match_size (o), version," >&2
echo " time (s) (timeout = ${timeout})" >&2 echo " time (s) (timeout = ${timeout})" >&2
...@@ -18,15 +22,36 @@ do ...@@ -18,15 +22,36 @@ do
for exe in ${runs[@]} for exe in ${runs[@]}
do do
( (
echo -n "${i}, $(stat -c %s ${f}), ${#m}, ${exe}, " fs=$(stat -c %s ${f})
ms=${#m}
do_time_out=0
while read -r line
do
texe=$(echo ${line} | cut -d '|' -f 1)
tfs=$(echo ${line} | cut -d '|' -f 2)
tms=$(echo ${line} | cut -d '|' -f 3)
if [ ${texe} = ${exe} -a ${tfs} -le ${fs} -a ${tms} -le ${ms} ]
then
do_time_out=$[${do_time_out} + 1]
fi
done < "${timeout_file}"
echo -n "${i}, ${fs}, ${ms}, ${exe}, "
if [ ${do_time_out} -gt 0 ]
then
echo "speculative timeout"
else
( (
timeout --foreground ${timeout} ./${exe} 3 ${f} ${m} timeout --foreground ${timeout} ./${exe} 3 ${f} ${m}
if [ ${?} -eq 124 ] if [ ${?} -eq 124 ]
then then
echo "timeout" echo "timeout"
echo "${exe}|${fs}|${ms}" >> "${timeout_file}"
fi fi
) \ ) \
| sed -n "s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p" | sed -n \
"s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p"
fi
) 2>&1 ) 2>&1
done done
done done
......
...@@ -170,8 +170,8 @@ levenshtein(char *s1, char *s2, int len, int *column, int approx_factor) ...@@ -170,8 +170,8 @@ levenshtein(char *s1, char *s2, int len, int *column, int approx_factor)
} }
__global__ void __global__ void
levenshtein_cu(char *find, char *buf, int len, int n_bytes levenshtein_cu(char *find, char *buf, int len, int n_bytes,
, int approx_factor, int *g_column, int *result) int approx_factor, int *result)
{ {
int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id
...@@ -183,7 +183,9 @@ levenshtein_cu(char *find, char *buf, int len, int n_bytes ...@@ -183,7 +183,9 @@ levenshtein_cu(char *find, char *buf, int len, int n_bytes
//position s2 and column to the right position in the pre-allocated //position s2 and column to the right position in the pre-allocated
//arrays //arrays
char *s2 = buf + tId; char *s2 = buf + tId;
int *column = g_column + tId * (len + 1); //int *column = g_column + tId * (len + 1);
extern __shared__ int g_column[];
int *column = g_column + threadIdx.x * (len + 1);
int x, y, lastdiag, olddiag; int x, y, lastdiag, olddiag;
...@@ -234,7 +236,7 @@ main(int argc, char **argv) ...@@ -234,7 +236,7 @@ main(int argc, char **argv)
int *n_matches; int *n_matches;
//cuda-related vars //cuda-related vars
char *buf_dev; char *buf_dev;
int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks //int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks
int NB = 0;//Number of blocks int NB = 0;//Number of blocks
/* Check number of arguments */ /* Check number of arguments */
...@@ -334,9 +336,20 @@ main(int argc, char **argv) ...@@ -334,9 +336,20 @@ main(int argc, char **argv)
cudaMalloc((void **)&result_dev, sizeof(int)); cudaMalloc((void **)&result_dev, sizeof(int));
__cudaCheckErrors("Unable to allocate result on device"); __cudaCheckErrors("Unable to allocate result on device");
int maxsharedmem;
cudaDeviceGetAttribute(&maxsharedmem, cudaDevAttrMaxSharedMemoryPerBlock, 0);
int NTBB = MIN3(maxsharedmem / ((size_pattern + 1) * sizeof(int)),
NUMBER_THREADS_BY_BLOCK, NUMBER_THREADS_BY_BLOCK);
int Nshared = (size_pattern + 1) * NTBB * sizeof(int);
off_t offset = 0; #if APM_DEBUG
printf("NTBB: %d\n", NTBB);
printf("maxsharedmem:%d\n", maxsharedmem);
printf("shared per block: %ld\n", (size_pattern + 1) * sizeof(int));
printf("Shard memory: %d\n", Nshared);
#endif
off_t offset = 0;
while (offset < filesize) while (offset < filesize)
{ {
#if APM_DEBUG #if APM_DEBUG
...@@ -355,13 +368,7 @@ main(int argc, char **argv) ...@@ -355,13 +368,7 @@ main(int argc, char **argv)
cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice); cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice);
__cudaCheckErrors("Unable to copy buffer onto device"); __cudaCheckErrors("Unable to copy buffer onto device");
NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0); NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0);
int *column_dev;
cudaMalloc((void **)&column_dev,
(size_pattern + 1)*NTBB * NB * sizeof(int));
__cudaCheckErrors("Unable to allocate column vector on device");
char *result_vec_dev; //result vectors. char *result_vec_dev; //result vectors.
cudaMalloc((void **)&result_vec_dev, NTBB * NB * sizeof(char)); cudaMalloc((void **)&result_vec_dev, NTBB * NB * sizeof(char));
...@@ -375,11 +382,11 @@ main(int argc, char **argv) ...@@ -375,11 +382,11 @@ main(int argc, char **argv)
return 1; return 1;
} }
levenshtein_cu <<< NB, NTBB>>>(pattern_dev, buf_dev, size_pattern, levenshtein_cu <<< NB, NTBB, Nshared>>>(
n_bytes, approx_factor, column_dev, result_dev); pattern_dev, buf_dev, size_pattern,
n_bytes, approx_factor, result_dev);
__cudaCheckErrors("Kernel execution of levenshtein_cu failed"); __cudaCheckErrors("Kernel execution of levenshtein_cu failed");
//get result //get result
cudaMemcpy(&result, result_dev, cudaMemcpy(&result, result_dev,
sizeof(int), cudaMemcpyDeviceToHost); sizeof(int), cudaMemcpyDeviceToHost);
...@@ -390,12 +397,6 @@ main(int argc, char **argv) ...@@ -390,12 +397,6 @@ main(int argc, char **argv)
offset += (MAX_BUFFER_SIZE - size_pattern + 1); offset += (MAX_BUFFER_SIZE - size_pattern + 1);
cudaFree(buf_dev); cudaFree(buf_dev);
__cudaCheckErrors("Unable to free memory for dev on device"); __cudaCheckErrors("Unable to free memory for dev on device");
free(buf);
cudaFree(column_dev);
__cudaCheckErrors("Unable to free memory for column on device");
} }
//free memory - and then get onto the next pattern. //free memory - and then get onto the next pattern.
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous connecter pour commenter