diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_1.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_1.cu new file mode 100755 index 0000000000000000000000000000000000000000..b9cec84d4d6483e803db185d1ebe8b7756b6d644 --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_1.cu @@ -0,0 +1,118 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> +#include <math.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second + */ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +int main(int argc, char** argv){ + int N = 0; + + double *A = NULL; + double *B = NULL; + double *C = NULL; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) 
malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // Value initialization + init(A, B, C, N); + + // Compute multiplication + t0 = get_elapsedtime(); + mult(A, B, C, N); + t1 = get_elapsedtime(); + + // Pretty print + duration = (t1 - t0); + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + free(A); + free(B); + free(C); + + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_2.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_2.cu new file mode 100755 index 0000000000000000000000000000000000000000..da1924f3750ab0df90eacfb234e6b181499c602f --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_2.cu @@ -0,0 +1,123 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> +#include <math.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! 
Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +int main(int argc, char** argv){ + int N, i; + + double *A; + double *B; + double *C; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // Value initialization + init(A, B, C, N); + + // QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // Compute multiplication + t0 = get_elapsedtime(); + mult(A, B, C, N); + t1 = get_elapsedtime(); + + // Pretty print + duration = (t1 - t0); + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2 * N_64 * N_64 * N_64; + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + free(A); + free(B); + free(C); + + return 0;} diff --git 
a/TPs/TP1/CORRECTION/Partie2/dgemm_3.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_3.cu new file mode 100755 index 0000000000000000000000000000000000000000..3a4488c7b3ab4feddd3a063c070875c29395f2aa --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_3.cu @@ -0,0 +1,130 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> +#include <math.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +int main(int argc, char** argv){ + int N; + + double *A = NULL; + double *B = NULL; + double *C = NULL; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); 
+ C = (double*) malloc(sizeof(double) * N * N); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + // FIN QUESTION 3 + + // Compute multiplication + t0 = get_elapsedtime(); + mult(A, B, C, N); + t1 = get_elapsedtime(); + + // Pretty print + duration = (t1 - t0); + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + free(A); + free(B); + free(C); + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_4.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_4.cu new file mode 100755 index 0000000000000000000000000000000000000000..488f4af800902b019edbc8179b7e8febee6e0063 --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_4.cu @@ -0,0 +1,138 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> +#include <math.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double 
diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +// QUESTION 4 +__global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ +} +// FIN QUESTION 4 + +int main(int argc, char** argv){ + int N, i; + + double *A = NULL; + double *B = NULL; + double *C = NULL; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + // FIN 
QUESTION 3 + + // QUESTION 4 + MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + // FIN QUESTION 4 + + + // Compute multiplication + t0 = get_elapsedtime(); + mult(A, B, C, N); + t1 = get_elapsedtime(); + + // Pretty print + duration = (t1 - t0); + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_5.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_5.cu new file mode 100755 index 0000000000000000000000000000000000000000..406bb4fb79ed01b82cf0fc0d598369f6b4031dff --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_5.cu @@ -0,0 +1,144 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> +#include <math.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! 
Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +// QUESTION 4 +__global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ +} +// FIN QUESTION 4 + +int main(int argc, char** argv){ + int N, i; + + double *A; + double *B; + double *C; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + // FIN QUESTION 3 + + // QUESTION 4 + MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + // FIN QUESTION 4 + + // QUESTION 5 + cudaMemcpy(C, d_C, 
sizeof(double) * N * N, cudaMemcpyDeviceToHost); + // FIN QUESTION 5 + + // Compute multiplication + t0 = get_elapsedtime(); + mult(A, B, C, N); + t1 = get_elapsedtime(); + + // Pretty print + duration = (t1 - t0); + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + free(A); + free(B); + free(C); + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_6.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_6.cu new file mode 100755 index 0000000000000000000000000000000000000000..b8b9cefe53c762327a1ee1926126aa062010c59f --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_6.cu @@ -0,0 +1,149 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second + */ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! 
Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +// QUESTION 4 + __global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ + // QUESTION 6 + int line = threadIdx.x + blockDim.x * blockIdx.x; + int col = threadIdx.y + blockDim.y * blockIdx.y; + // FIN QUESTION 6 +} +// FIN QUESTION 4 + +int main(int argc, char** argv){ + int N; + + double *A; + double *B; + double *C; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A = (double*) malloc(sizeof(double) * N * N); + B = (double*) malloc(sizeof(double) * N * N); + C = (double*) malloc(sizeof(double) * N * N); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + // FIN QUESTION 3 + + 
// QUESTION 4 + MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + // FIN QUESTION 4 + + // QUESTION 5 + cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); + // FIN QUESTION 5 + + // Compute multiplication + t0 = get_elapsedtime(); + mult(A, B, C, N); + t1 = get_elapsedtime(); + + // Pretty print + duration = (t1 - t0); + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + free(A); + free(B); + free(C); + + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu b/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu new file mode 100755 index 0000000000000000000000000000000000000000..7e6abed0a36d1032a6a2f178ba302e4f4bfa69f0 --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie2/dgemm_7.cu @@ -0,0 +1,197 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! 
Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +// QUESTION 4 +__global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ + // QUESTION 6 + int line = threadIdx.x + blockDim.x * blockIdx.x; + int col = threadIdx.y + blockDim.y * blockIdx.y; + // FIN QUESTION 6 + + // QUESTION 7 + if((col < N) && (line < N)) + { + double val = 0.0f; + for(int k = 0; k < N; k++) + { + val += A[line * N + k] * B[k * N + col]; + } + C[line * N + col] = val; + } + // FIN QUESTION 7 +} +// FIN QUESTION 4 + +int main(int argc, char** argv){ + int N; + int use_cpu; + + double *A; + double *B; + double *C; + double *C_ref; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + use_cpu = (argc < 3)?0:atoi(argv[2]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A= (double*) malloc(sizeof(double) * N * N); + B= (double*) malloc(sizeof(double) * N * N); + C= (double*) malloc(sizeof(double) * N * N); + if (use_cpu) + C_ref = (double*) malloc(sizeof(double) * N * N); + + // timers + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + 
// QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + // FIN QUESTION 3 + + // QUESTION 4 + cudaEventRecord(start); + MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + cudaEventRecord(stop); + // FIN QUESTION 4 + + // QUESTION 5 + cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); + // FIN QUESTION 5 + + // Compute multiplication + if (use_cpu){ + t0 = get_elapsedtime(); + mult(A, B, C_ref, N); + t1 = get_elapsedtime(); + } + + // get gpu elapsed time + cudaEventSynchronize(stop); + float gpu_duration = 0; + cudaEventElapsedTime(&gpu_duration, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Pretty print + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + + if (use_cpu){ + duration = (t1 - t0); + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + if (verify_matrix(C, C_ref, N) != 0){ + fprintf(stderr, "Wrong result !\n"); + } + } + fprintf(stdout, "Performance results (GPU): \n"); + fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); + fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); + + + free(A); + free(B); + free(C); + + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu new file mode 100755 index 0000000000000000000000000000000000000000..459789977f5d39de2aba3e4bbd9990cf23e54a2a --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_1D.cu @@ -0,0 +1,197 @@ +#include <stdio.h> 
+#include <stdlib.h> +#include <time.h> +#include <inttypes.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +// QUESTION 4 +__global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ + // QUESTION 6 + int line = threadIdx.x / BLOCK_WIDTH + BLOCK_WIDTH * blockIdx.x; + int col = threadIdx.x % BLOCK_WIDTH + BLOCK_WIDTH * blockIdx.y; + // FIN QUESTION 6 + + // QUESTION 7 + if((col < N) && (line < N)) + { + double val = 0.0f; + for(int k = 0; k < N; k++) + { + val += A[line * N + k] * B[k * N + col]; + } + C[line * N + col] = val; + } + // FIN QUESTION 7 +} +// FIN QUESTION 4 + +int main(int argc, char** argv){ + int N; + int use_cpu; + + double *A; + double *B; + double *C; + double *C_ref; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 
2)?1000:atoi(argv[1]); + use_cpu = (argc < 3)?0:atoi(argv[2]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A= (double*) malloc(sizeof(double) * N * N); + B= (double*) malloc(sizeof(double) * N * N); + C= (double*) malloc(sizeof(double) * N * N); + if (use_cpu) + C_ref = (double*) malloc(sizeof(double) * N * N); + + // timers + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + // QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH); + // FIN QUESTION 3 + + // QUESTION 4 + cudaEventRecord(start); + MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + cudaEventRecord(stop); + // FIN QUESTION 4 + + // QUESTION 5 + cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); + // FIN QUESTION 5 + + // Compute multiplication + if (use_cpu){ + t0 = get_elapsedtime(); + mult(A, B, C_ref, N); + t1 = get_elapsedtime(); + } + + // get gpu elapsed time + cudaEventSynchronize(stop); + float gpu_duration = 0; + cudaEventElapsedTime(&gpu_duration, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Pretty print + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + + if (use_cpu){ + duration = (t1 - t0); + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / 
duration)*1E-6); + + if (verify_matrix(C, C_ref, N) != 0){ + fprintf(stderr, "Wrong result !\n"); + } + } + fprintf(stdout, "Performance results (GPU): \n"); + fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); + fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); + + + free(A); + free(B); + free(C); + + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu new file mode 100755 index 0000000000000000000000000000000000000000..93a076bd84ef97f8c0313a9ad362fc8adaffdc86 --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie3/dgemm_coalescing_2D.cu @@ -0,0 +1,197 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <inttypes.h> + +#include <cuda.h> +#include <cuda_runtime.h> +#define BLOCK_WIDTH 32 +#define TAILLE 4096 + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +int verify_matrix(double *matRef, double *matOut, int N) { + double diff = 0.0; + int i; + for (i = 0; i < N*N; i++) { + diff = fabs(matRef[i] - matOut[i]); + if (diff > 0.01) { + printf("Divergence! 
Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n", + matRef[i], matOut[i], diff, i); + return 1; + } + } + return 0; +} + +void init(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0; + + srand(2019); + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + A[i * size + j] = rand(); + B[i * size + j] = rand(); + C[i * size + j] = 0.0; + } + } +} + +void mult(double* A, double* B, double* C, int size) +{ + int i = 0, j = 0, k = 0; + + for(i = 0; i < size; i++) + { + for(j = 0; j < size; j++) + { + double sum = 0.; + for(k = 0; k < size; k++) + { + sum += A[i * size + k] * B[k * size + j]; + } + C[i * size + j] = sum; + } + } +} + +// QUESTION 4 +__global__ +void MulMatrixKernel(double* A, double* B, double* C, int N) +{ + // QUESTION 6 + int col = threadIdx.x + blockDim.x * blockIdx.x; + int line = threadIdx.y + blockDim.y * blockIdx.y; + // FIN QUESTION 6 + + // QUESTION 7 + if((col < N) && (line < N)) + { + double val = 0.0f; + for(int k = 0; k < N; k++) + { + val += A[line * N + k] * B[k * N + col]; + } + C[line * N + col] = val; + } + // FIN QUESTION 7 +} +// FIN QUESTION 4 + +int main(int argc, char** argv){ + int N; + int use_cpu; + + double *A; + double *B; + double *C; + double *C_ref; + + double t0 = 0., t1 = 0., duration = 0.; + + N = (argc < 2)?1000:atoi(argv[1]); + use_cpu = (argc < 3)?0:atoi(argv[2]); + fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); + + // Memory allocation + A= (double*) malloc(sizeof(double) * N * N); + B= (double*) malloc(sizeof(double) * N * N); + C= (double*) malloc(sizeof(double) * N * N); + if (use_cpu) + C_ref = (double*) malloc(sizeof(double) * N * N); + + // timers + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Value initialization + init(A, B, C, N); + + // QUESTION 1 + double *d_A, *d_B, *d_C; + cudaMalloc(&d_A, sizeof(double) * N * N); + cudaMalloc(&d_B, sizeof(double) * N * N); + cudaMalloc(&d_C, sizeof(double) * N * N); + // FIN QUESTION 1 + + 
// QUESTION 2 + cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice); + // FIN QUESTION 2 + + // QUESTION 3 + int nbBlocks = N / BLOCK_WIDTH; + if(N % BLOCK_WIDTH) nbBlocks++; + dim3 gridSize(nbBlocks, nbBlocks); + dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH); + // FIN QUESTION 3 + + // QUESTION 4 + cudaEventRecord(start); + MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N); + cudaEventRecord(stop); + // FIN QUESTION 4 + + // QUESTION 5 + cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost); + // FIN QUESTION 5 + + // Compute multiplication + if (use_cpu){ + t0 = get_elapsedtime(); + mult(A, B, C_ref, N); + t1 = get_elapsedtime(); + } + + // get gpu elapsed time + cudaEventSynchronize(stop); + float gpu_duration = 0; + cudaEventElapsedTime(&gpu_duration, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + // Pretty print + uint64_t N_64 = (uint64_t) N; + uint64_t nb_op = 2* N_64 * N_64 * N_64; + + if (use_cpu){ + duration = (t1 - t0); + fprintf(stdout, "Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); + + if (verify_matrix(C, C_ref, N) != 0){ + fprintf(stderr, "Wrong result !\n"); + } + } + fprintf(stdout, "Performance results (GPU): \n"); + fprintf(stdout, " Time: %lf s\n", gpu_duration*1E-3); + fprintf(stdout, " GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6); + + + free(A); + free(B); + free(C); + + cudaFree(A); + cudaFree(B); + cudaFree(C); + + return 0; +} diff --git a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu new file mode 100755 index 0000000000000000000000000000000000000000..05ea42ff7ce0526bee4413902474e6fc8faa8b71 --- /dev/null +++ b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_1D.cu @@ -0,0 +1,224 @@ +#include <stdio.h> +#include 
<stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <time.h>      /* BUG FIX: clock_gettime / struct timespec require <time.h> */
#include <inttypes.h>  /* BUG FIX: uint64_t requires <inttypes.h> */

#define BLOCK_WIDTH 32
#define TAILLE 4096

#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/** return time in second
*/
double get_elapsedtime(void)
{
  struct timespec st;
  int err = gettime(&st);
  if (err !=0) return 0;
  return (double)st.tv_sec + get_sub_seconde(st);
}

/* Element-wise comparison of two N*N matrices: prints and returns 1 on the
 * first absolute difference larger than 0.01, returns 0 when they match. */
int verify_matrix(double *matRef, double *matOut, int N) {
  double diff = 0.0;
  int i;
  for (i = 0; i < N*N; i++) {
    diff = fabs(matRef[i] - matOut[i]);
    if (diff > 0.01) {
      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
             matRef[i], matOut[i], diff, i);
      return 1;
    }
  }
  return 0;
}

/* Fill A and B with pseudo-random values (fixed seed) and clear C. */
void init(double* A, double* B, double* C, int size)
{
  int i = 0, j = 0;

  srand(2019);

  for(i = 0; i < size; i++)
  {
    for(j = 0; j < size; j++)
    {
      A[i * size + j] = rand();
      B[i * size + j] = rand();
      C[i * size + j] = 0.0;
    }
  }
}

/* Reference CPU matrix multiplication C = A * B (row-major, size x size). */
void mult(double* A, double* B, double* C, int size)
{
  int i = 0, j = 0, k = 0;

  for(i = 0; i < size; i++)
  {
    for(j = 0; j < size; j++)
    {
      double sum = 0.;
      for(k = 0; k < size; k++)
      {
        sum += A[i * size + k] * B[k * size + j];
      }
      C[i * size + j] = sum;
    }
  }
}

/* Naive GPU kernel (2-D launch): one thread per element of C, with bounds
 * check. Kept for comparison with the shared-memory version below. */
__global__
void MulMatrixKernel(double* A, double* B, double* C, int N)
{
  int col   = threadIdx.x + blockDim.x * blockIdx.x;
  int ligne = threadIdx.y + blockDim.y * blockIdx.y;

  if((col < N) && (ligne < N))
  {
    double val = 0.0;
    for(int k = 0; k < N; k++)
    {
      val += A[ligne * N + k] * B[k * N + col];
    }
    C[ligne * N + col] = val;
  }
}

/* Tiled matrix multiplication using shared memory, launched with a 1-D block
 * of BLOCK_WIDTH*BLOCK_WIDTH threads and a 2-D grid.
 * Precondition: N is a multiple of BLOCK_WIDTH (no bounds check; main pads N). */
__global__
void MulMatrixShare(double* A, double* B, double* C, int N){

  // Shared-memory tiles in row-major layout;
  // tile dimensions = thread-block dimensions.
  __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH];
  __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH];

  // First row and column of C handled by this thread block.
  int blockRow = BLOCK_WIDTH * blockIdx.y;
  int blockCol = BLOCK_WIDTH * blockIdx.x;

  // Local coordinates inside the tile. The block is 1-D, so the row is the
  // quotient and the column the remainder of threadIdx.x by BLOCK_WIDTH.
  // BUG FIX: threadRow was computed with '%' like threadCol, making every
  // thread use row == col and producing wrong results; it must be '/'.
  int threadCol = threadIdx.x % BLOCK_WIDTH;
  int threadRow = threadIdx.x / BLOCK_WIDTH;

  // Value of C computed by this thread.
  double value = 0.0;

  for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH)
  {
    A_s[BLOCK_WIDTH * threadRow + threadCol] =
        A[(blockRow /* first row handled by the thread block */
           + threadRow) * N /* offset to the row handled by this thread */
          + tile_offset     /* offset to the current tile */
          + threadCol];     /* offset to the column handled by this thread */

    B_s[BLOCK_WIDTH * threadRow + threadCol] =
        B[tile_offset * N   /* offset to the first row of the current tile */
          + threadRow * N   /* offset to the thread's row inside the tile */
          + blockCol        /* offset to the block's column */
          + threadCol];     /* offset to the thread's column */

    // Wait until every thread of the block has loaded its two elements
    // into shared memory.
    __syncthreads();

    for(int k =0; k < BLOCK_WIDTH; k++)
    {
      value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol];
    }

    // Make sure every thread is done with the current tile before the next
    // iteration overwrites the shared buffers.
    __syncthreads();
  }

  // Store the accumulated value into C (global memory).
  C[(blockRow + threadRow) * N + blockCol + threadCol] = value;
}

int main(int argc, char** argv)
{
  int N, use_cpu;

  N = (argc < 2)?1000:atoi(argv[1]);
  use_cpu = (argc < 3)?0:atoi(argv[2]);

  double t0 = 0., t1 = 0., duration = 0.;

  // MulMatrixShare has no bounds check, so pad N up to the next multiple of
  // BLOCK_WIDTH and derive the grid size from the padded value.
  // BUG FIX: the original added (N % BLOCK_WIDTH) to N, which does NOT round
  // up to a multiple of BLOCK_WIDTH (e.g. 1000 -> 1008), and computed
  // nbBlocks from the un-padded N, leaving the last tiles uncomputed.
  if(N % BLOCK_WIDTH) N += BLOCK_WIDTH - (N % BLOCK_WIDTH);
  int nbBlocks = N / BLOCK_WIDTH;
  dim3 gridSize(nbBlocks, nbBlocks);
  dim3 blockSize(BLOCK_WIDTH * BLOCK_WIDTH);  // 1-D block of 1024 threads

  double *A, *B, *C, *C_ref = NULL;
  double *d_A, *d_B, *d_C;

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  A = (double*) malloc(sizeof(double) * N * N);
  B = (double*) malloc(sizeof(double) * N * N);
  C = (double*) malloc(sizeof(double) * N * N);
  if (use_cpu)
    C_ref = (double*) malloc(sizeof(double) * N * N);

  cudaMalloc(&d_A, sizeof(double) * N * N);
  cudaMalloc(&d_B, sizeof(double) * N * N);
  cudaMalloc(&d_C, sizeof(double) * N * N);

  // Value initialization
  init(A, B, C, N);

  cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice);
  cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice);

  cudaEventRecord(start);
  //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
  MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
  cudaEventRecord(stop);

  cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost);

  // Compute multiplication on cpu (reference)
  if (use_cpu){
    t0 = get_elapsedtime();
    mult(A, B, C_ref, N);
    t1 = get_elapsedtime();
  }

  // get gpu elapsed time (cudaEventElapsedTime reports milliseconds)
  cudaEventSynchronize(stop);
  float gpu_duration = 0;
  cudaEventElapsedTime(&gpu_duration, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // Pretty print
  uint64_t N_64 = (uint64_t) N;
  uint64_t nb_op = 2* N_64 * N_64 * N_64;

  if (use_cpu){
    duration = (t1 - t0);
    fprintf(stdout, "Performance results: \n");
    fprintf(stdout, "  Time: %lf s\n", duration);
    fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);

    if (verify_matrix(C, C_ref, N) != 0){
      fprintf(stderr, "Wrong result !\n");
    }
  }
  fprintf(stdout, "Performance results (GPU): \n");
  fprintf(stdout, "  Time: %lf s\n", gpu_duration*1E-3);
  fprintf(stdout, "  GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6);

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  free(A);
  free(B);
  free(C);
  if (use_cpu)
    free(C_ref);  /* BUG FIX: C_ref was leaked */

  return 0;
}
diff --git a/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu
new file mode 100755
index 
0000000000000000000000000000000000000000..519ae89cc7c58ea69d5fa29e021a3515836525be
--- /dev/null
+++ b/TPs/TP1/CORRECTION/Partie4/dgemm_shared_2D.cu
@@ -0,0 +1,224 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <time.h>      /* BUG FIX: clock_gettime / struct timespec require <time.h> */
#include <inttypes.h>  /* BUG FIX: uint64_t requires <inttypes.h> */

#define BLOCK_WIDTH 32
#define TAILLE 4096

#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/** return time in second
*/
double get_elapsedtime(void)
{
  struct timespec st;
  int err = gettime(&st);
  if (err !=0) return 0;
  return (double)st.tv_sec + get_sub_seconde(st);
}

/* Element-wise comparison of two N*N matrices: prints and returns 1 on the
 * first absolute difference larger than 0.01, returns 0 when they match. */
int verify_matrix(double *matRef, double *matOut, int N) {
  double diff = 0.0;
  int i;
  for (i = 0; i < N*N; i++) {
    diff = fabs(matRef[i] - matOut[i]);
    if (diff > 0.01) {
      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
             matRef[i], matOut[i], diff, i);
      return 1;
    }
  }
  return 0;
}

/* Fill A and B with pseudo-random values (fixed seed) and clear C. */
void init(double* A, double* B, double* C, int size)
{
  int i = 0, j = 0;

  srand(2019);

  for(i = 0; i < size; i++)
  {
    for(j = 0; j < size; j++)
    {
      A[i * size + j] = rand();
      B[i * size + j] = rand();
      C[i * size + j] = 0.0;
    }
  }
}

/* Reference CPU matrix multiplication C = A * B (row-major, size x size). */
void mult(double* A, double* B, double* C, int size)
{
  int i = 0, j = 0, k = 0;

  for(i = 0; i < size; i++)
  {
    for(j = 0; j < size; j++)
    {
      double sum = 0.;
      for(k = 0; k < size; k++)
      {
        sum += A[i * size + k] * B[k * size + j];
      }
      C[i * size + j] = sum;
    }
  }
}

/* Naive GPU kernel (2-D launch): one thread per element of C, with bounds
 * check. Kept for comparison with the shared-memory version below. */
__global__
void MulMatrixKernel(double* A, double* B, double* C, int N)
{
  int col   = threadIdx.x + blockDim.x * blockIdx.x;
  int ligne = threadIdx.y + blockDim.y * blockIdx.y;

  if((col < N) && (ligne < N))
  {
    double val = 0.0;
    for(int k = 0; k < N; k++)
    {
      val += A[ligne * N + k] * B[k * N + col];
    }
    C[ligne * N + col] = val;
  }
}

/* Tiled matrix multiplication using shared memory, launched with a 2-D block
 * of BLOCK_WIDTH x BLOCK_WIDTH threads and a 2-D grid.
 * Precondition: N is a multiple of BLOCK_WIDTH (no bounds check; main pads N). */
__global__
void MulMatrixShare(double* A, double* B, double* C, int N){

  // Shared-memory tiles in row-major layout;
  // tile dimensions = thread-block dimensions.
  __shared__ double A_s[BLOCK_WIDTH * BLOCK_WIDTH];
  __shared__ double B_s[BLOCK_WIDTH * BLOCK_WIDTH];

  // First row and column of C handled by this thread block.
  int blockRow = BLOCK_WIDTH * blockIdx.y;
  int blockCol = BLOCK_WIDTH * blockIdx.x;

  // Local coordinates inside the tile (block is 2-D, so they map directly).
  int threadCol = threadIdx.x;
  int threadRow = threadIdx.y;

  // Value of C computed by this thread.
  double value = 0.0;

  for(int tile_offset = 0; tile_offset < N; tile_offset+=BLOCK_WIDTH)
  {
    A_s[BLOCK_WIDTH * threadRow + threadCol] =
        A[(blockRow /* first row handled by the thread block */
           + threadRow) * N /* offset to the row handled by this thread */
          + tile_offset     /* offset to the current tile */
          + threadCol];     /* offset to the column handled by this thread */

    B_s[BLOCK_WIDTH * threadRow + threadCol] =
        B[tile_offset * N   /* offset to the first row of the current tile */
          + threadRow * N   /* offset to the thread's row inside the tile */
          + blockCol        /* offset to the block's column */
          + threadCol];     /* offset to the thread's column */

    // Wait until every thread of the block has loaded its two elements
    // into shared memory.
    __syncthreads();

    for(int k =0; k < BLOCK_WIDTH; k++)
    {
      value += A_s[threadRow * BLOCK_WIDTH + k] * B_s[k * BLOCK_WIDTH + threadCol];
    }

    // Make sure every thread is done with the current tile before the next
    // iteration overwrites the shared buffers.
    __syncthreads();
  }

  // Store the accumulated value into C (global memory).
  C[(blockRow + threadRow) * N + blockCol + threadCol] = value;
}

int main(int argc, char** argv)
{
  int N, use_cpu;

  N = (argc < 2)?1000:atoi(argv[1]);
  use_cpu = (argc < 3)?0:atoi(argv[2]);

  double t0 = 0., t1 = 0., duration = 0.;

  // MulMatrixShare has no bounds check, so pad N up to the next multiple of
  // BLOCK_WIDTH and derive the grid size from the padded value.
  // BUG FIX: the original added (N % BLOCK_WIDTH) to N, which does NOT round
  // up to a multiple of BLOCK_WIDTH (e.g. 1000 -> 1008), and computed
  // nbBlocks from the un-padded N, leaving the last tiles uncomputed.
  if(N % BLOCK_WIDTH) N += BLOCK_WIDTH - (N % BLOCK_WIDTH);
  int nbBlocks = N / BLOCK_WIDTH;
  dim3 gridSize(nbBlocks, nbBlocks);
  dim3 blockSize(BLOCK_WIDTH, BLOCK_WIDTH);  // 2-D block of 32x32 threads

  double *A, *B, *C, *C_ref = NULL;
  double *d_A, *d_B, *d_C;

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  A = (double*) malloc(sizeof(double) * N * N);
  B = (double*) malloc(sizeof(double) * N * N);
  C = (double*) malloc(sizeof(double) * N * N);
  if (use_cpu)
    C_ref = (double*) malloc(sizeof(double) * N * N);

  cudaMalloc(&d_A, sizeof(double) * N * N);
  cudaMalloc(&d_B, sizeof(double) * N * N);
  cudaMalloc(&d_C, sizeof(double) * N * N);

  // Value initialization
  init(A, B, C, N);

  cudaMemcpy(d_A, A, sizeof(double) * N * N, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, sizeof(double) * N * N, cudaMemcpyHostToDevice);
  cudaMemcpy(d_C, C, sizeof(double) * N * N, cudaMemcpyHostToDevice);

  cudaEventRecord(start);
  //MulMatrixKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
  MulMatrixShare<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
  cudaEventRecord(stop);

  cudaMemcpy(C, d_C, sizeof(double) * N * N, cudaMemcpyDeviceToHost);

  // Compute multiplication on cpu (reference)
  if (use_cpu){
    t0 = get_elapsedtime();
    mult(A, B, C_ref, N);
    t1 = get_elapsedtime();
  }

  // get gpu elapsed time (cudaEventElapsedTime reports milliseconds)
  cudaEventSynchronize(stop);
  float gpu_duration = 0;
  cudaEventElapsedTime(&gpu_duration, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // Pretty print
  uint64_t N_64 = (uint64_t) N;
  uint64_t nb_op = 2* N_64 * N_64 * N_64;

  if (use_cpu){
    duration = (t1 - t0);
    fprintf(stdout, "Performance results: \n");
    fprintf(stdout, "  Time: %lf s\n", duration);
    fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);

    if (verify_matrix(C, C_ref, N) != 0){
      fprintf(stderr, "Wrong result !\n");
    }
  }
  fprintf(stdout, "Performance results (GPU): \n");
  fprintf(stdout, "  Time: %lf s\n", gpu_duration*1E-3);
  fprintf(stdout, "  GFlops: %.2f\n", (nb_op / gpu_duration)*1E-6);

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  free(A);
  free(B);
  free(C);
  if (use_cpu)
    free(C_ref);  /* BUG FIX: C_ref was leaked */

  return 0;
}
diff --git a/TPs/TP1/SUJET/tp1.pdf b/TPs/TP1/SUJET/tp1.pdf
index 68e14831f27b7a20e1351e4346c717d606af12ee..e68da3d37adb9e0938be007ea7b2b1708cf92e37 100644
Binary files a/TPs/TP1/SUJET/tp1.pdf and b/TPs/TP1/SUJET/tp1.pdf differ