diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu index 7e6abed0a36d1032a6a2f178ba302e4f4bfa69f0..3d2ae823aee66c06f3ec31f9ce648510d3f8a097 100755 --- a/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu @@ -33,22 +33,18 @@ int verify_matrix(double *matRef, double *matOut, int N) { } return 0; } +void init(double* A, double* B, double* C, int size) { + int i = 0, j = 0; -void init(double* A, double* B, double* C, int size) -{ - int i = 0, j = 0; - - srand(2019); + srand(2019); - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) - { - A[i * size + j] = rand(); - B[i * size + j] = rand(); - C[i * size + j] = 0.0; - } - } + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + C[i * size + j] = 0.0; + } + } } void mult(double* A, double* B, double* C, int size) diff --git a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu index 459789977f5d39de2aba3e4bbd9990cf23e54a2a..036df166c7152add35715c4dd45e18e11f4d2c8c 100755 --- a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu +++ b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu @@ -34,21 +34,18 @@ int verify_matrix(double *matRef, double *matOut, int N) { return 0; } -void init(double* A, double* B, double* C, int size) -{ - int i = 0, j = 0; +void init(double* A, double* B, double* C, int size) { + int i = 0, j = 0; - srand(2019); + srand(2019); - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) - { - A[i * size + j] = rand(); - B[i * size + j] = rand(); - C[i * size + j] = 0.0; - } - } + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + C[i * size + j] = 0.0; + } + } } void mult(double* A, double* B, double* C, int size) diff --git a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu index 93a076bd84ef97f8c0313a9ad362fc8adaffdc86..68d35f3ab81bcb3a4cc3a1aef9e17e31fc0102b1 100755 --- a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu +++ b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu @@ -33,22 +33,18 @@ int verify_matrix(double *matRef, double *matOut, int N) { } return 0; } +void init(double* A, double* B, double* C, int size) { + int i = 0, j = 0; -void init(double* A, double* B, double* C, int size) -{ - int i = 0, j = 0; - - srand(2019); + srand(2019); - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) - { - A[i * size + j] = rand(); - B[i * size + j] = rand(); - C[i * size + j] = 0.0; - } - } + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + C[i * size + j] = 0.0; + } + } } void mult(double* A, double* B, double* C, int size) diff --git a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu index 05ea42ff7ce0526bee4413902474e6fc8faa8b71..28b03b6924ef63c52a3f03344d88896cb02aef09 100755 --- a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu +++ b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu @@ -10,88 +10,85 @@ #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) /** return time in second -*/ + */ double get_elapsedtime(void) { - 
struct timespec st; - int err = gettime(&st); - if (err !=0) return 0; - return (double)st.tv_sec + get_sub_seconde(st); + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); } int verify_matrix(double *matRef, double *matOut, int N) { - double diff = 0.0; - int i; - for (i = 0; i < N*N; i++) { - diff = fabs(matRef[i] - matOut[i]); - if (diff > 0.01) { - printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", - matRef[i], matOut[i], diff, i); - return 1; + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } } - } - return 0; + return 0; } -void init(double* A, double* B, double* C, int size) -{ - int i = 0, j = 0; +void init(double* A, double* B, double* C, int size) { + int i = 0, j = 0; - srand(2019); + srand(2019); - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) - { - A[i * size + j] = rand(); - B[i * size + j] = rand(); - C[i * size + j] = 0.0; - } - } + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + C[i * size + j] = 0.0; + } + } } void mult(double* A, double* B, double* C, int size) { - int i = 0, j = 0, k = 0; + int i = 0, j = 0, k = 0; - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) + for(i = 0; i < size; i++) { - double sum = 0.; - for(k = 0; k < size; k++) - { - sum += A[i * size + k] * B[k * size + j]; - } - C[i * size + j] = sum; - } - } + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } } -__global__ - void MulMatrixKernel(double* A, double* B, double* C, int N) - { - int col = threadIdx.x + blockDim.x * blockIdx.x; - int ligne = threadIdx.y + blockDim.y * blockIdx.y; + __global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ + int col = threadIdx.x + blockDim.x * blockIdx.x; + int ligne = threadIdx.y + blockDim.y * blockIdx.y; - if((col < N) && (ligne < N)) + if((col < N) && (ligne < N)) { - double val = 0.0f; - for(int k = 0; k < N; k++) - { - val += A[ligne * N + k] * B[k * N + col]; - } - C[ligne * N + col] = val; - } - } + double val = 0.0f; + for(int k = 0; k < N; k++) + { + val += A[ligne * N + k] * B[k * N + col]; + } + C[ligne * N + col] = val; + } +} __global__ - void MulMatrixShare(double* A, double* B, double* C, int N){ +void MulMatrixShare(double* A, double* B, double* C, int N){ // tuile en shared memory sous format row major // dimensions de la tuile = dimension du block de thread - __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH]; - __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH]; + __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH]; + __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH]; // indices des premières ligne et colonne calculées par le bloc de thread int blockRow = BLOCK_WIDTH * blockIdx.y; @@ -103,122 +100,122 @@ __global__ // indices de la valeur de C calculé par le thread - double value = 0.0f; + double value = 0.0f; - for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH) + for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH) { - A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */ - + threadRow) * N /* décalage à la 
ligne traitée par le thread */ - + tile_offset /* décalage à la tuile courante */ - + threadCol]; /* dcalage à la colonne traitée par le thread */ + A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */ + + threadRow) * N /* décalage à la ligne traitée par le thread */ + + tile_offset /* décalage à la tuile courante */ + + threadCol]; /* dcalage à la colonne traitée par le thread */ - B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */ - + threadRow * N /* décalage à la ligne du thread */ - + blockCol /* décalage à la colonne du bloc */ - + threadCol]; /* décalage à la colonne du thread */ + B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */ + + threadRow * N /* décalage à la ligne du thread */ + + blockCol /* décalage à la colonne du bloc */ + + threadCol]; /* décalage à la colonne du thread */ - // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices - __syncthreads(); + // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices + __syncthreads(); - for(int k =0; k < BLOCK_WIDTH; k++) - { - value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol]; - } + for(int k =0; k < BLOCK_WIDTH; k++) + { + value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol]; + } - // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile - __syncthreads(); - } + // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile + __syncthreads(); + } // Enregistrer la valeur accumulée dans C (mémoire globale) - C[(blockRow + threadRow) * N + blockCol + threadCol] = value; + C[(blockRow + threadRow) * N + blockCol + threadCol] = value; } int main(int argc, char** argv) { - int N, use_cpu; - - N = (argc < 2)?1000:atoi(argv[1]); - use_cpu = (argc < 3)?0:atoi(argv[2]); - - double t0 = 0., t1 = 0., duration = 0.; - - int nbBlocks = N / BLOCK_WIDTH; - //if(N % BLOCK_WIDTH) nbBlocks++; - if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH); - dim3 gridSize(nbBlocks, nbBlocks); - dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH); - - double *A, *B, *C, *C_ref; - double *d_A, *d_B, *d_C; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - A = (double*) malloc(sizeof(double) * N * N); - B = (double*) malloc(sizeof(double) * N * N); - C = (double*) malloc(sizeof(double) * N * N); - if (use_cpu) - C_ref = (double*) malloc(sizeof(double) * N * N); - - cudaMalloc(&d_A, sizeof(double) * N * N); - cudaMalloc(&d_B, sizeof(double) * N * N); - cudaMalloc(&d_C, sizeof(double) * N * N); - - // Value initialization - init(A, B, C, N); - - cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); - cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); - - cudaEventRecord(start); - //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); - MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); - cudaEventRecord(stop); - - cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); - - // Compute multiplication on cpu - if (use_cpu){ - t0 = get_elapsedtime(); - mult(A, B, C_ref, N); - t1 = get_elapsedtime(); - } - - // get gpu elapsed time - cudaEventSynchronize(stop); - float 
gpu_duration = 0; - cudaEventElapsedTime(&gpu_duration, start, stop); - cudaEventDestroy(start); - cudaEventDestroy(stop); - - // Pretty print - uint64_t N_64 = (uint64_t) N; - uint64_t nb_op = 2* N_64 * N_64 * N_64; - - if (use_cpu){ - duration = (t1 - t0); - fprintf(stdout, "Performance results: \n"); - fprintf(stdout, " Time: %lf s\n", duration); - fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); - - if (verify_matrix(C, C_ref, N) != 0){ - fprintf(stderr, "Wrong result !\n"); + int N, use_cpu; + + N = (argc < 2)?1000:atoi(argv[1]); + use_cpu = (argc < 3)?0:atoi(argv[2]); + + double t0 = 0., t1 = 0., duration = 0.; + + int nbBlocks = N / BLOCK_WIDTH; + //if(N % BLOCK_WIDTH) nbBlocks++; + if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH); + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH); + + double *A, *B, *C, *C_ref; + double *d_A, *d_B, *d_C; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + if (use_cpu) + C_ref = (double*) malloc(sizeof(double) * N * N); + + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + + // Value initialization + init(A, B, C, N); + + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + + cudaEventRecord(start); + //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + cudaEventRecord(stop); + + cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); + + // Compute multiplication on cpu + if (use_cpu){ + t0 = get_elapsedtime(); + mult(A, B, C_ref, N); + t1 = get_elapsedtime(); + } + + // get gpu elapsed time + cudaEventSynchronize(stop); + float gpu_duration = 0; + cudaEventElapsedTime(&gpu_duration, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Pretty print + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + + if (use_cpu){ + duration = (t1 - t0); + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + if (verify_matrix(C, C_ref, N) != 0){ + fprintf(stderr, "Wrong result !\n"); + } } - } - fprintf(stdout, "Performance results (GPU): \n"); - fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); - fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); + fprintf(stdout, "Performance results (GPU): \n"); + fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); + fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); - free(A); - free(B); - free(C); + free(A); + free(B); + free(C); - return 0; + return 0; } diff --git a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu index 519ae89cc7c58ea69d5fa29e021a3515836525be..bb9f2024484e4a56b5622b7e50c6d38704f157f8 100755 --- a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu +++ b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu @@ -10,88 +10,85 @@ #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) /** return time in second -*/ + */ 
double get_elapsedtime(void) { - struct timespec st; - int err = gettime(&st); - if (err !=0) return 0; - return (double)st.tv_sec + get_sub_seconde(st); + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); } int verify_matrix(double *matRef, double *matOut, int N) { - double diff = 0.0; - int i; - for (i = 0; i < N*N; i++) { - diff = fabs(matRef[i] - matOut[i]); - if (diff > 0.01) { - printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", - matRef[i], matOut[i], diff, i); - return 1; + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } } - } - return 0; + return 0; } -void init(double* A, double* B, double* C, int size) -{ - int i = 0, j = 0; +void init(double* A, double* B, double* C, int size) { + int i = 0, j = 0; - srand(2019); + srand(2019); - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) - { - A[i * size + j] = rand(); - B[i * size + j] = rand(); - C[i * size + j] = 0.0; - } - } + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + A[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + B[i * size + j] = (double)(rand() % 10) + 0.01 * (rand() % 5); + C[i * size + j] = 0.0; + } + } } void mult(double* A, double* B, double* C, int size) { - int i = 0, j = 0, k = 0; + int i = 0, j = 0, k = 0; - for(i = 0; i < size; i++) - { - for(j = 0; j < size; j++) + for(i = 0; i < size; i++) { - double sum = 0.; - for(k = 0; k < size; k++) - { - sum += A[i * size + k] * B[k * size + j]; - } - C[i * size + j] = sum; - } - } + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } } -__global__ - void MulMatrixKernel(double* A, double* B, double* C, int N) - { - int col = threadIdx.x + blockDim.x * blockIdx.x; - int ligne = threadIdx.y + blockDim.y * blockIdx.y; + __global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ + int col = threadIdx.x + blockDim.x * blockIdx.x; + int ligne = threadIdx.y + blockDim.y * blockIdx.y; - if((col < N) && (ligne < N)) + if((col < N) && (ligne < N)) { - double val = 0.0f; - for(int k = 0; k < N; k++) - { - val += A[ligne * N + k] * B[k * N + col]; - } - C[ligne * N + col] = val; - } - } + double val = 0.0f; + for(int k = 0; k < N; k++) + { + val += A[ligne * N + k] * B[k * N + col]; + } + C[ligne * N + col] = val; + } +} __global__ - void MulMatrixShare(double* A, double* B, double* C, int N){ +void MulMatrixShare(double* A, double* B, double* C, int N){ // tuile en shared memory sous format row major // dimensions de la tuile = dimension du block de thread - __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH]; - __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH]; + __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH]; + __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH]; // indices des premières ligne et colonne calculées par le bloc de thread int blockRow = BLOCK_WIDTH * blockIdx.y; @@ -103,122 +100,122 @@ __global__ // indices de la valeur de C calculé par le thread - double value = 0.0f; + double value = 0.0f; - for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH) + for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH) { - A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */ - + 
threadRow) * N /* décalage à la ligne traitée par le thread */ - + tile_offset /* décalage à la tuile courante */ - + threadCol]; /* dcalage à la colonne traitée par le thread */ + A_s[BLOCK_WIDTH * threadRow + threadCol] = A[(blockRow /* première ligne traitée par le bloc de thread */ + + threadRow) * N /* décalage à la ligne traitée par le thread */ + + tile_offset /* décalage à la tuile courante */ + + threadCol]; /* dcalage à la colonne traitée par le thread */ - B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */ - + threadRow * N /* décalage à la ligne du thread */ - + blockCol /* décalage à la colonne du bloc */ - + threadCol]; /* décalage à la colonne du thread */ + B_s[BLOCK_WIDTH * threadRow + threadCol] = B[tile_offset * N /* décalage à la ligne tuile courante */ + + threadRow * N /* décalage à la ligne du thread */ + + blockCol /* décalage à la colonne du bloc */ + + threadCol]; /* décalage à la colonne du thread */ - // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices - __syncthreads(); + // Attente que tous les threads ont bien chargé dans la mémoire partagée leurs deux indices + __syncthreads(); - for(int k =0; k < BLOCK_WIDTH; k++) - { - value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol]; - } + for(int k =0; k < BLOCK_WIDTH; k++) + { + value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol]; + } - // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile - __syncthreads(); - } + // S'assurer que tous les threads ont bien fini le calcul du préliminaire du tile courant avant de commencer la prochaine étape du calcul de cette tile + __syncthreads(); + } // Enregistrer la valeur accumulée dans C (mémoire globale) - C[(blockRow + threadRow) * N + blockCol + threadCol] = value; + C[(blockRow + threadRow) * N + blockCol + threadCol] = value; } int main(int argc, char** argv) { - int N, use_cpu; - - N = (argc < 2)?1000:atoi(argv[1]); - use_cpu = (argc < 3)?0:atoi(argv[2]); - - double t0 = 0., t1 = 0., duration = 0.; - - int nbBlocks = N / BLOCK_WIDTH; - //if(N % BLOCK_WIDTH) nbBlocks++; - if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH); - dim3 gridSize(nbBlocks, nbBlocks); - dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); - - double *A, *B, *C, *C_ref; - double *d_A, *d_B, *d_C; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - A = (double*) malloc(sizeof(double) * N * N); - B = (double*) malloc(sizeof(double) * N * N); - C = (double*) malloc(sizeof(double) * N * N); - if (use_cpu) - C_ref = (double*) malloc(sizeof(double) * N * N); - - cudaMalloc(&d_A, sizeof(double) * N * N); - cudaMalloc(&d_B, sizeof(double) * N * N); - cudaMalloc(&d_C, sizeof(double) * N * N); - - // Value initialization - init(A, B, C, N); - - cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); - cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); - - cudaEventRecord(start); - //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); - MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); - cudaEventRecord(stop); - - cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); - - // Compute multiplication on cpu - if (use_cpu){ - t0 = get_elapsedtime(); - mult(A, B, C_ref, N); - t1 = get_elapsedtime(); - } - - // get gpu elapsed time - 
cudaEventSynchronize(stop); - float gpu_duration = 0; - cudaEventElapsedTime(&gpu_duration, start, stop); - cudaEventDestroy(start); - cudaEventDestroy(stop); - - // Pretty print - uint64_t N_64 = (uint64_t) N; - uint64_t nb_op = 2* N_64 * N_64 * N_64; - - if (use_cpu){ - duration = (t1 - t0); - fprintf(stdout, "Performance results: \n"); - fprintf(stdout, " Time: %lf s\n", duration); - fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); - - if (verify_matrix(C, C_ref, N) != 0){ - fprintf(stderr, "Wrong result !\n"); + int N, use_cpu; + + N = (argc < 2)?1000:atoi(argv[1]); + use_cpu = (argc < 3)?0:atoi(argv[2]); + + double t0 = 0., t1 = 0., duration = 0.; + + int nbBlocks = N / BLOCK_WIDTH; + //if(N % BLOCK_WIDTH) nbBlocks++; + if(N % BLOCK_WIDTH) N += (N % BLOCK_WIDTH); + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + + double *A, *B, *C, *C_ref; + double *d_A, *d_B, *d_C; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + if (use_cpu) + C_ref = (double*) malloc(sizeof(double) * N * N); + + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + + // Value initialization + init(A, B, C, N); + + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + + cudaEventRecord(start); + //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + cudaEventRecord(stop); + + cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); + + // Compute multiplication on cpu + if (use_cpu){ + t0 = get_elapsedtime(); + mult(A, B, C_ref, N); + t1 = get_elapsedtime(); + } + + // get gpu elapsed time + cudaEventSynchronize(stop); + float gpu_duration = 0; + cudaEventElapsedTime(&gpu_duration, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Pretty print + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + + if (use_cpu){ + duration = (t1 - t0); + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + if (verify_matrix(C, C_ref, N) != 0){ + fprintf(stderr, "Wrong result !\n"); + } } - } - fprintf(stdout, "Performance results (GPU): \n"); - fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); - fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); + fprintf(stdout, "Performance results (GPU): \n"); + fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); + fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); - free(A); - free(B); - free(C); + free(A); + free(B); + free(C); - return 0; + return 0; }
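
One remark on the launch setup shared by both dgemm_shared_*.cu files above: nbBlocks = N / BLOCK_WIDTH is computed before N is adjusted, and N += (N % BLOCK_WIDTH) only lands on a multiple of BLOCK_WIDTH for some remainders. When the requested N is not already a multiple of the tile width, the grid then leaves the last rows and columns of C uncomputed, and MulMatrixShare (unlike MulMatrixKernel) has no (col < N) && (ligne < N) guard to fall back on. Below is a minimal host-side sketch of the padding that the commented-out "nbBlocks++" line seems to aim for; it is not part of the patch, and the BLOCK_WIDTH value and the printf are illustrative assumptions only (the real constant comes from the #define in the .cu files).

/* Hypothetical sketch, not part of the patch: pad N up to the next multiple
 * of BLOCK_WIDTH and derive the grid from the padded size, so that
 * MulMatrixShare (which has no bounds check) covers every tile of C. */
#include <stdio.h>

#define BLOCK_WIDTH 16                  /* assumed value; the real one is defined in the .cu files */

int main(void)
{
    int N = 1000;                                 /* requested matrix size */

    if (N % BLOCK_WIDTH)
        N += BLOCK_WIDTH - (N % BLOCK_WIDTH);     /* round up: 1000 -> 1008 = 63 * 16 */

    int nbBlocks = N / BLOCK_WIDTH;               /* computed AFTER padding: 63 */

    /* In the real programs this would feed the launch configuration:
     *   dim3 gridSize(nbBlocks, nbBlocks);
     *   dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH);      (dgemm_shared_2D.cu)
     *   dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH);     (dgemm_shared_1D.cu)
     * with A, B and C allocated and initialised at the padded N x N size. */
    printf("padded N = %d, nbBlocks = %d\n", N, nbBlocks);
    return 0;
}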