Skip to content
Extraits de code Groupes Projets
Valider 132b2166 rédigé par Nicolas MARIE's avatar Nicolas MARIE
Parcourir les fichiers

added shared memory support to gpu and fixed gpu memory limit on long match strings

parent baa9a9ab
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -2,9 +2,13 @@
files=("dna/vicPac2_1.fa" "dna/vicPac2_2.fa" "dna/vicPac2_3.fa")
matches=("12345" "1234567890" "12345678901234567890")
matches=("12345" "1234567890" "12345678901234567890" "123456789012345678901234567890")
runs=("apm" "apm_omp" "apm_gpu")
timeout=3
timeout=30
timeout_file="timeout.txt"
rm -f "${timeout_file}"
touch "${timeout_file}"
echo -n "# test iteration, file_size (o), match_size (o), version," >&2
echo " time (s) (timeout = ${timeout})" >&2
......@@ -18,15 +22,36 @@ do
for exe in ${runs[@]}
do
(
echo -n "${i}, $(stat -c %s ${f}), ${#m}, ${exe}, "
fs=$(stat -c %s ${f})
ms=${#m}
do_time_out=0
while read -r line
do
texe=$(echo ${line} | cut -d '|' -f 1)
tfs=$(echo ${line} | cut -d '|' -f 2)
tms=$(echo ${line} | cut -d '|' -f 3)
if [ ${texe} = ${exe} -a ${tfs} -le ${fs} -a ${tms} -le ${ms} ]
then
do_time_out=$[${do_time_out} + 1]
fi
done < "${timeout_file}"
echo -n "${i}, ${fs}, ${ms}, ${exe}, "
if [ ${do_time_out} -gt 0 ]
then
echo "speculative timeout"
else
(
timeout --foreground ${timeout} ./${exe} 3 ${f} ${m}
if [ ${?} -eq 124 ]
then
echo "timeout"
echo "${exe}|${fs}|${ms}" >> "${timeout_file}"
fi
) \
| sed -n "s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p"
| sed -n \
"s/APM done in \([0-9\.]*\) s\|\(timeout\)/\1\2/p"
fi
) 2>&1
done
done
......
......@@ -170,8 +170,8 @@ levenshtein(char *s1, char *s2, int len, int *column, int approx_factor)
}
__global__ void
levenshtein_cu(char *find, char *buf, int len, int n_bytes
, int approx_factor, int *g_column, int *result)
levenshtein_cu(char *find, char *buf, int len, int n_bytes,
int approx_factor, int *result)
{
int tId = blockIdx.x * blockDim.x + threadIdx.x;//global thread id
......@@ -183,7 +183,9 @@ levenshtein_cu(char *find, char *buf, int len, int n_bytes
//position s2 and column to the right position in the pre-allocated
//arrays
char *s2 = buf + tId;
int *column = g_column + tId * (len + 1);
//int *column = g_column + tId * (len + 1);
extern __shared__ int g_column[];
int *column = g_column + threadIdx.x * (len + 1);
int x, y, lastdiag, olddiag;
......@@ -234,7 +236,7 @@ main(int argc, char **argv)
int *n_matches;
//cuda-related vars
char *buf_dev;
int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks
//int NTBB = NUMBER_THREADS_BY_BLOCK; //Number of threads by blocks
int NB = 0;//Number of blocks
/* Check number of arguments */
......@@ -334,9 +336,20 @@ main(int argc, char **argv)
cudaMalloc((void **)&result_dev, sizeof(int));
__cudaCheckErrors("Unable to allocate result on device");
int maxsharedmem;
cudaDeviceGetAttribute(&maxsharedmem, cudaDevAttrMaxSharedMemoryPerBlock, 0);
int NTBB = MIN3(maxsharedmem / ((size_pattern + 1) * sizeof(int)),
NUMBER_THREADS_BY_BLOCK, NUMBER_THREADS_BY_BLOCK);
int Nshared = (size_pattern + 1) * NTBB * sizeof(int);
off_t offset = 0;
#if APM_DEBUG
printf("NTBB: %d\n", NTBB);
printf("maxsharedmem:%d\n", maxsharedmem);
printf("shared per block: %ld\n", (size_pattern + 1) * sizeof(int));
printf("Shard memory: %d\n", Nshared);
#endif
off_t offset = 0;
while (offset < filesize)
{
#if APM_DEBUG
......@@ -355,13 +368,7 @@ main(int argc, char **argv)
cudaMemcpy(buf_dev, buf, n_bytes, cudaMemcpyHostToDevice);
__cudaCheckErrors("Unable to copy buffer onto device");
NB = (n_bytes / NTBB) + (((n_bytes % NTBB) > 0) ? 1 : 0);
int *column_dev;
cudaMalloc((void **)&column_dev,
(size_pattern + 1)*NTBB * NB * sizeof(int));
__cudaCheckErrors("Unable to allocate column vector on device");
char *result_vec_dev; //result vectors.
cudaMalloc((void **)&result_vec_dev, NTBB * NB * sizeof(char));
......@@ -375,11 +382,11 @@ main(int argc, char **argv)
return 1;
}
levenshtein_cu <<< NB, NTBB>>>(pattern_dev, buf_dev, size_pattern,
n_bytes, approx_factor, column_dev, result_dev);
levenshtein_cu <<< NB, NTBB, Nshared>>>(
pattern_dev, buf_dev, size_pattern,
n_bytes, approx_factor, result_dev);
__cudaCheckErrors("Kernel execution of levenshtein_cu failed");
//get result
cudaMemcpy(&result, result_dev,
sizeof(int), cudaMemcpyDeviceToHost);
......@@ -390,12 +397,6 @@ main(int argc, char **argv)
offset += (MAX_BUFFER_SIZE - size_pattern + 1);
cudaFree(buf_dev);
__cudaCheckErrors("Unable to free memory for dev on device");
free(buf);
cudaFree(column_dev);
__cudaCheckErrors("Unable to free memory for column on device");
}
//free memory - and then get onto the next pattern.
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous connecter pour commenter