diff --git a/TPs/TP1/CODE/Partie2/Makefile b/TPs/TP1/CODE/Partie2/Makefile
index 4ae22712a5d3de26583073d9da7dd646cc80b99c..e34c97c2b64e4a3d9d15d1b7791002bcad7197cd 100755
--- a/TPs/TP1/CODE/Partie2/Makefile
+++ b/TPs/TP1/CODE/Partie2/Makefile
@@ -1,20 +1,32 @@
 CC=gcc
-CFLAGS=-O3 -Wall
-EXE=mult.exe
+NVCC=nvcc
+CFLAGS=-O3 -Wall -Wextra
+NVFLAGS=-O3 -I .
+TARGET=mult_cpu
+TARGETGPU=mult_gpu
 
-all : $(EXE)
+all: $(TARGET) $(TARGETGPU)
 
-$(EXE) : dgemm.o
-	$(CC) $(CFLAGS) -o $@ $<
+$(TARGET): dgemm_cpu.o
+	$(CC) $(CFLAGS) -o $@ $<
 
-%.o : %.c
+$(TARGETGPU): dgemm_gpu.o
+	$(NVCC) $(NVFLAGS) -o $@ $<
+
+%_cpu.o: %.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
-run : mult.exe
-	./$< 1000
+%_gpu.o: %.cu
+	$(NVCC) $(NVFLAGS) -c -o $@ $<
+
+run_cpu: $(TARGET)
+	./$(TARGET) 2000
+
+run_gpu: $(TARGETGPU)
+	./$(TARGETGPU) 1024
 
 proper :
	rm -f *.o
 
 clean :
-	rm -f *.o $(EXE)
+	rm -f *.o $(TARGET) $(TARGETGPU)
diff --git a/TPs/TP1/CODE/Partie2/dgemm.c b/TPs/TP1/CODE/Partie2/dgemm.c
index 90f051ebb7862ce96116ed4488b3836a2efaf480..c5486c130debcba16508d3d1af7cb5c1a374f536 100755
--- a/TPs/TP1/CODE/Partie2/dgemm.c
+++ b/TPs/TP1/CODE/Partie2/dgemm.c
@@ -23,7 +23,7 @@ int verify_matrix(double *matRef, double *matOut, int N) {
   for (i = 0; i < size; i++) {
     diff = fabs(matRef[i] - matOut[i]);
     if (diff > 0.01) {
-      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
+      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %ld\n",
              matRef[i], matOut[i], diff, i);
       return 1;
     }
diff --git a/TPs/TP1/CODE/Partie2/dgemm.cu b/TPs/TP1/CODE/Partie2/dgemm.cu
new file mode 100755
index 0000000000000000000000000000000000000000..eaea2550af035b224b79e5f74b6ea30e88239446
--- /dev/null
+++ b/TPs/TP1/CODE/Partie2/dgemm.cu
@@ -0,0 +1,243 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <inttypes.h>
+#include <math.h>
+
+#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
+#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+
+#include "helper_cuda.h"
+
+/** return time in second
+*/
+double get_elapsedtime(void)
+{
+  struct timespec st;
+  int err = gettime(&st);
+  if (err != 0) return 0;
+  return (double)st.tv_sec + get_sub_seconde(st);
+}
+
+int verify_matrix(double *matRef, double *matOut, int N) {
+  double diff = 0.0;
+  uint64_t i;
+  uint64_t size = N*N;
+  for (i = 0; i < size; i++) {
+    diff = fabs(matRef[i] - matOut[i]);
+    if (diff > 0.01) {
+      printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %" PRIu64 "\n",
+             matRef[i], matOut[i], diff, i);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+void init(double* A, double* B, double* C, int size)
+{
+  int i = 0, j = 0;
+
+  srand(2019);
+
+  for(i = 0; i < size; i++)
+  {
+    for(j = 0; j < size; j++)
+    {
+      A[i * size + j] = (double) (rand() % 10) + 0.01 * (rand() % 5);
+      B[i * size + j] = (double) (rand() % 10) + 0.01 * (rand() % 5);
+      C[i * size + j] = 0.0;
+    }
+  }
+}
+
+void mult(double* A, double* B, double* C, int size)
+{
+  int i = 0, j = 0, k = 0;
+
+  for(i = 0; i < size; i++)
+  {
+    for(j = 0; j < size; j++)
+    {
+      double sum = 0.;
+      for(k = 0; k < size; k++)
+      {
+        sum += A[i * size + k] * B[k * size + j];
+      }
+      C[i * size + j] = sum;
+    }
+  }
+}
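+
+// Note on the two naive kernels below: they differ only in which thread
+// index maps to which matrix coordinate. In multk, threadIdx.x (the index
+// that varies fastest within a warp) selects the row i, so neighbouring
+// threads read A and write C at addresses N doubles apart: uncoalesced.
+// In multk2, threadIdx.x selects the column j, so a warp reads consecutive
+// elements of B and writes consecutive elements of C: coalesced accesses,
+// which typically perform much better.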
+__global__ void multk(double *A, double *B, double *C, int N)
+{
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= N || j >= N)
+    return;
+
+  double sum = 0.;
+  for (int k = 0; k < N; ++k)
+  {
+    sum += A[i * N + k] * B[k * N + j];
+  }
+  C[i * N + j] = sum;
+}
+
+__global__ void multk2(double *A, double *B, double *C, int N)
+{
+  const int i = blockIdx.y * blockDim.y + threadIdx.y;
+  const int j = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= N || j >= N)
+    return;
+
+  double sum = 0.;
+  for (int k = 0; k < N; ++k)
+  {
+    sum += A[i * N + k] * B[k * N + j];
+  }
+  C[i * N + j] = sum;
+}
+
+// Tiled kernel: each block stages one 32x32 tile of A and of B in shared
+// memory and accumulates partial dot products from there.
+__global__ void multk3(double *A, double *B, double *C, int N)
+{
+  const int blockSize = 32;
+  __shared__ double A_s[32 * 32];
+  __shared__ double B_s[32 * 32];
+
+  const int i = blockIdx.y * blockSize + threadIdx.y;
+  const int j = blockIdx.x * blockSize + threadIdx.x;
+
+  // Out-of-range threads must NOT return early: every thread of the block
+  // has to reach the __syncthreads() calls below. They load zero padding
+  // instead and simply skip the final store.
+  double sum = 0.;
+  for (int z = 0; z < N; z += blockSize)
+  {
+    int k = z + threadIdx.x;
+    int l = z + threadIdx.y;
+    A_s[threadIdx.y * blockSize + threadIdx.x] =
+        (i < N && k < N) ? A[i * N + k] : 0.0;
+    B_s[threadIdx.y * blockSize + threadIdx.x] =
+        (l < N && j < N) ? B[l * N + j] : 0.0;
+    __syncthreads();
+
+    for (int m = 0; m < blockSize; ++m)
+    {
+      sum += A_s[threadIdx.y * blockSize + m]
+           * B_s[m * blockSize + threadIdx.x];
+    }
+    __syncthreads();
+  }
+
+  if (i < N && j < N)
+    C[i * N + j] = sum;
+}
+
+int main(int argc, char** argv)
+{
+  int N = 0;
+
+  double *A_h = NULL;
+  double *B_h = NULL;
+  double *C_h = NULL;
+  double *C_cpu = NULL;
+
+  double *A_d = NULL;
+  double *B_d = NULL;
+  double *C_d = NULL;
+
+  double t0 = 0., t1 = 0., duration = 0.;
+
+  N = (argc < 2) ? 1000 : atoi(argv[1]);
+  fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N);
+
+  // Memory allocation
+  A_h = (double*) malloc(sizeof(double) * N * N);
+  B_h = (double*) malloc(sizeof(double) * N * N);
+  C_h = (double*) malloc(sizeof(double) * N * N);
+  C_cpu = (double*) malloc(sizeof(double) * N * N);
+
+  checkCudaErrors(cudaMalloc((void **) &A_d, sizeof(double) * N * N));
+  checkCudaErrors(cudaMalloc((void **) &B_d, sizeof(double) * N * N));
+  checkCudaErrors(cudaMalloc((void **) &C_d, sizeof(double) * N * N));
+
+  // Value initialization
+  init(A_h, B_h, C_h, N);
+  checkCudaErrors(cudaMemcpy(A_d, A_h,
+      sizeof(double) * N * N, cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(B_d, B_h,
+      sizeof(double) * N * N, cudaMemcpyHostToDevice));
+
+  // Compute multiplication on the CPU (reference result)
+  t0 = get_elapsedtime();
+  mult(A_h, B_h, C_cpu, N);
+  t1 = get_elapsedtime();
+
+  cudaEvent_t ct[6];
+
+  for(int i = 0; i < 6; ++i)
+    cudaEventCreate(&ct[i]);
+
+  float cudaDuration[3];
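+
+  // Launch configuration: one 32x32 thread block per 32x32 tile of C.
+  // Ceiling division ((N + 31) / 32 blocks per dimension) covers sizes
+  // that are not a multiple of 32; the kernels' bounds checks discard
+  // the overhang.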
+  int blockSize = 32;
+  dim3 dimBlock(blockSize, blockSize, 1);
+  dim3 dimGrid((N + blockSize - 1) / blockSize,
+               (N + blockSize - 1) / blockSize, 1);
+  //int sharedMem = blockSize * blockSize * 2 * sizeof(double);
+
+  cudaEventRecord(ct[0]);
+  multk<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+  cudaEventRecord(ct[1]);
+  cudaDeviceSynchronize();
+  getLastCudaError("Mult kernel 1 failed");
+
+  cudaEventRecord(ct[2]);
+  multk2<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+  cudaEventRecord(ct[3]);
+  cudaDeviceSynchronize();
+  getLastCudaError("Mult kernel 2 failed");
+
+  cudaEventRecord(ct[4]);
+  //multk3<<<dimGrid, dimBlock, sharedMem>>>(A_d, B_d, C_d, N);
+  multk3<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
+  cudaEventRecord(ct[5]);
+  cudaDeviceSynchronize();
+  getLastCudaError("Mult kernel 3 failed");
+
+  // cudaEventElapsedTime reports elapsed time in milliseconds
+  cudaEventElapsedTime(&cudaDuration[0], ct[0], ct[1]);
+  cudaEventElapsedTime(&cudaDuration[1], ct[2], ct[3]);
+  cudaEventElapsedTime(&cudaDuration[2], ct[4], ct[5]);
+
+  checkCudaErrors(cudaMemcpy(C_h, C_d,
+      sizeof(double) * N * N, cudaMemcpyDeviceToHost));
+
+  verify_matrix(C_cpu, C_h, N);
+
+  // Pretty print
+  duration = (t1 - t0);
+  uint64_t N_u64 = (uint64_t) N;
+  uint64_t nb_op = N_u64 * N_u64 * N_u64;
+
+  fprintf(stdout, "CPU Performance results: \n");
+  fprintf(stdout, "  Time: %lf s\n", duration);
+  fprintf(stdout, "  MFlops: %.2f\n", (nb_op / duration)*1E-6);
+
+  fprintf(stdout, "GPU Performance results 1: \n");
+  fprintf(stdout, "  Time: %lf s\n", cudaDuration[0] / 1000);
+  fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[0]) * 1E-3);
+
+  fprintf(stdout, "GPU Performance results 2: \n");
+  fprintf(stdout, "  Time: %lf s\n", cudaDuration[1] / 1000);
+  fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[1]) * 1E-3);
+
+  fprintf(stdout, "GPU Performance results 3: \n");
+  fprintf(stdout, "  Time: %lf s\n", cudaDuration[2] / 1000);
+  fprintf(stdout, "  MFlops: %.2f\n", (nb_op / cudaDuration[2]) * 1E-3);
+
+  // Release resources
+  for(int i = 0; i < 6; ++i)
+    cudaEventDestroy(ct[i]);
+
+  free(A_h); free(B_h); free(C_h); free(C_cpu);
+  checkCudaErrors(cudaFree(A_d));
+  checkCudaErrors(cudaFree(B_d));
+  checkCudaErrors(cudaFree(C_d));
+
+  return 0;
+}
diff --git a/TPs/TP1/CODE/Partie2/helper_cuda.h b/TPs/TP1/CODE/Partie2/helper_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dd446dced85de9d10eb9f1597c56099fa358bdb
--- /dev/null
+++ b/TPs/TP1/CODE/Partie2/helper_cuda.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+#ifndef MIN
+#define MIN(a, b) (a < b ? a : b)
+#endif
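+
+// Error-checking pattern: wrap every CUDA runtime call in checkCudaErrors.
+// Kernel launches return no status, so call getLastCudaError right after a
+// launch (after synchronizing) to surface launch and runtime failures.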
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void check(cudaError_t result, char const *const func,
+                  const char *const file, int const line)
+{
+  if (result)
+  {
+    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line,
+            (int)result, cudaGetErrorName(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line)
+{
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err)
+  {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, (int)(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM)
+  typedef struct
+  {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+             // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60, 64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70, 64},
+      {0x72, 64},
+      {0x75, 64},
+      {0x80, 64},
+      {0x86, 128},
+      {0x87, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1)
+  {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default to the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      " Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char *_ConvertSMVer2ArchName(int major, int minor)
+{
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct
+  {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+             // and m = SM minor version
+    const char *name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1)
+  {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
+    {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default to the previous one
+  // to run properly
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+ " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} +// end of GPU Architecture definitions + +// end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ \ No newline at end of file diff --git a/TPs/TP1/tp1.md b/TPs/TP1/tp1.md new file mode 100644 index 0000000000000000000000000000000000000000..e27657708436082c72fbed38e2f9ef412ac35c19 --- /dev/null +++ b/TPs/TP1/tp1.md @@ -0,0 +1,37 @@ + +Q.13 + +Op = 4096 * 4096 * 4096 = 68719476736 = 69 Gfp32 +Vr = 4096 * 4096 * 2 * 4 = 134217728 = 134 Mo +Vw = 4096 * 4096 * 4 = 67108864 = 67 Mo + + + +mem théorique GPU: 56 Go/s +flop théorique: 2273 Gfp32 + +V = Vr + Vw = 134 + 67 = 201 Mo +Tdata = 201 / 56 = 3.589 ms +Tcompute = 68 / 2.273 = 29.92 ms +rho = 29.92 / 3.59 ) 8.334 + +Le plus gros du temps est passer sur les calculs. + +Q.14 + +Performance réel: 9769.44 Mfp32 + +9.76944 / 2273 = 0.0043 = 0.43 % d'utilisation maximal des performances de calcules. + +Q.15 + +Op = 4096 * 4096 * 4096 = 68719476736 = 69 Gfp32 +Vr = 4096 * 4096 * 4096 * 2 * 4 = 549755813888 = 550 Go +Vw = 4096 * 4096 * 4 = 67108864 = 67 Mo + +Le trafique mémoire au pire est de 550 Go. + + + + +