Skip to content
Extraits de code Groupes Projets
Valider 876c3c4e rédigé par Nicolas MARIE's avatar Nicolas MARIE
Parcourir les fichiers

done TP1

parent 9e24fea4
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
CC=gcc
CFLAGS=-O3 -Wall
EXE=mult.exe
NVCC=nvcc
CFLAGS=-O3 -Wall -Wextra
NVFLAGS=-O3 -I .
TARGET=mult_cpu
TARGETGPU=mult_gpu
all : $(EXE)
all: $(TARGET) $(TARGETGPU)
$(EXE) : dgemm.o
$(TARGET): dgemm_cpu.o
$(CC) $(CFLAGS) -o $@ $<
%.o : %.c
$(TARGETGPU): dgemm_gpu.o
$(NVCC) $(NVFLAGS) -o $@ $<
%_cpu.o: %.c
$(CC) $(CFLAGS) -c -o $@ $<
run : mult.exe
./$< 1000
%_gpu.o: %.cu
$(NVCC) $(NVFLAGS) -c -o $@ $<
run_cpu: $(TARGET)
./$(TARGET) 2000
run_gpu: $(TARGET)
./$(TARGETGPU) 1024
proper :
rm -f *.o
clean :
rm -f *.o $(EXE)
rm -f *.o $(TARGET) $(TARGETGPU)
......@@ -23,7 +23,7 @@ int verify_matrix(double *matRef, double *matOut, int N) {
for (i = 0; i < size; i++) {
diff = fabs(matRef[i] - matOut[i]);
if (diff > 0.01) {
printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %d\n",
printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %ld\n",
matRef[i], matOut[i], diff, i);
return 1;
}
......
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <inttypes.h>
#include <math.h>
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
#include "helper_cuda.h"
/** return time in second
*/
double get_elapsedtime(void)
{
struct timespec st;
int err = gettime(&st);
if (err !=0) return 0;
return (double)st.tv_sec + get_sub_seconde(st);
}
int verify_matrix(double *matRef, double *matOut, int N) {
double diff = 0.0;
uint64_t i;
uint64_t size = N*N;
for (i = 0; i < size; i++) {
diff = fabs(matRef[i] - matOut[i]);
if (diff > 0.01) {
printf("Divergence! Should %5.2f, Is %5.2f (Diff %5.2f) at %ld\n",
matRef[i], matOut[i], diff, i);
return 1;
}
}
return 0;
}
void init(double* A, double* B, double* C, int size)
{
int i = 0, j = 0;
srand(2019);
for(i = 0; i < size; i++)
{
for(j = 0; j < size; j++)
{
A[i * size + j] = (double) (rand() % 10) + 0.01 * (rand() % 5);
B[i * size + j] = (double) (rand() % 10) + 0.01 * (rand() % 5);
C[i * size + j] = 0.0;
}
}
}
void mult(double* A, double* B, double* C, int size)
{
int i = 0, j = 0, k = 0;
for(i = 0; i < size; i++)
{
for(j = 0; j < size; j++)
{
double sum = 0.;
for(k = 0; k < size; k++)
{
sum += A[i * size + k] * B[k * size + j];
}
C[i * size + j] = sum;
}
}
}
__global__ void multk(double *A, double *B, double *C, int N)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x;
const int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= N || j >= N)
return;
double sum = 0.;
for (int k = 0; k < N; ++k)
{
sum += A[i * N + k] * B[k * N + j];
}
C[i * N + j] = sum;
}
__global__ void multk2(double *A, double *B, double *C, int N)
{
const int i = blockIdx.y * blockDim.y + threadIdx.y;
const int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= N || j >= N)
return;
double sum = 0.;
for (int k = 0; k < N; ++k)
{
sum += A[i * N + k] * B[k * N + j];
}
C[i * N + j] = sum;
}
__global__ void multk3(double *A, double *B, double *C, int N)
{
const int blockSize = 32;
__shared__ double A_s[32 * 32];
__shared__ double B_s[32 * 32];
const int i = blockIdx.y * blockSize + threadIdx.y;
const int j = blockIdx.x * blockSize + threadIdx.x;
if (i >= N || j >= N)
return;
double sum = 0;
for (int z = 0; z < N; z += blockSize)
{
int k = z + threadIdx.x;
int l = z + threadIdx.y;
if (k < N)
A_s[threadIdx.y * blockDim.x + threadIdx.x] = A[i * N + k];
if (l < N)
B_s[threadIdx.y * blockDim.x + threadIdx.x] = B[l * N + j];
__syncthreads();
for (int m = 0; m < blockSize; ++m){
if (z + m < N)
sum += A_s[threadIdx.y * blockSize + m]
* B_s[m * blockSize + threadIdx.x];
}
__syncthreads();
}
C[i * N + j] = sum;
}
int main(int argc, char** argv){
int N = 0;
double *A_h = NULL;
double *B_h = NULL;
double *C_h = NULL;
double *C_cpu = NULL;
double *A_d = NULL;
double *B_d = NULL;
double *C_d = NULL;
double t0 = 0., t1 = 0., duration = 0.;
N = (argc < 2) ? 1000 : atoi(argv[1]);
fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N);
// Memory allocation
A_h = (double*) malloc(sizeof(double) * N * N);
B_h = (double*) malloc(sizeof(double) * N * N);
C_h = (double*) malloc(sizeof(double) * N * N);
C_cpu = (double*) malloc(sizeof(double) * N * N);
checkCudaErrors(cudaMalloc((void **) &A_d, sizeof(double) * N * N));
checkCudaErrors(cudaMalloc((void **) &B_d, sizeof(double) * N * N));
checkCudaErrors(cudaMalloc((void **) &C_d, sizeof(double) * N * N));
// Value initialization
init(A_h, B_h, C_h, N);
checkCudaErrors(cudaMemcpy(A_d, A_h,
sizeof(double) * N * N, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(B_d, B_h,
sizeof(double) * N * N, cudaMemcpyHostToDevice));
// Compute multiplication
t0 = get_elapsedtime();
mult(A_h, B_h, C_cpu, N);
t1 = get_elapsedtime();
cudaEvent_t ct[6];
for(int i = 0; i < 6; ++i)
cudaEventCreate(&ct[i]);
float cudaDuration[3];
int blockSize = 32;
dim3 dimBlock(blockSize, blockSize, 1);
dim3 dimGrid((N / blockSize) + 1, (N / blockSize) + 1, 1);
//int sharedMem = blockSize * blockSize * 2 * sizeof(double);
cudaEventRecord(ct[0]);
multk<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
cudaEventRecord(ct[1]);
cudaDeviceSynchronize();
getLastCudaError("Mult kernel 1 failed");
cudaEventRecord(ct[2]);
multk2<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
cudaEventRecord(ct[3]);
cudaDeviceSynchronize();
getLastCudaError("Mult kernel 2 failed");
cudaEventRecord(ct[4]);
//multk3<<<dimGrid, dimBlock, sharedMem>>>(A_d, B_d, C_d, N);
multk3<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N);
cudaEventRecord(ct[5]);
cudaDeviceSynchronize();
getLastCudaError("Mult kernel 3 failed");
cudaEventElapsedTime(&cudaDuration[0], ct[0], ct[1]);
cudaEventElapsedTime(&cudaDuration[1], ct[2], ct[3]);
cudaEventElapsedTime(&cudaDuration[2], ct[4], ct[5]);
checkCudaErrors(cudaMemcpy(C_h, C_d,
sizeof(double) * N * N, cudaMemcpyDeviceToHost));
verify_matrix(C_cpu, C_h, N);
// Pretty print
duration = (t1 - t0);
uint64_t N_u64 = (uint64_t) N;
uint64_t nb_op = N_u64 * N_u64 * N_u64;
fprintf(stdout, "CPU Performance results: \n");
fprintf(stdout, " Time: %lf s\n", duration);
fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6);
fprintf(stdout, "GPU Performance results 1: \n");
fprintf(stdout, " Time: %lf s\n", cudaDuration[0] / 1000);
fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration[0]) * 1E-3);
fprintf(stdout, "GPU Performance results 2: \n");
fprintf(stdout, " Time: %lf s\n", cudaDuration[1] / 1000);
fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration[1]) * 1E-3);
fprintf(stdout, "GPU Performance results 3: \n");
fprintf(stdout, " Time: %lf s\n", cudaDuration[2] / 1000);
fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration[2]) * 1E-3);
return 0;
}
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking
#ifndef COMMON_HELPER_CUDA_H_
#define COMMON_HELPER_CUDA_H_
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif
#ifndef MIN
#define MIN(a, b) (a < b ? a : b)
#endif
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
void check(cudaError_t result, char const *const func, const char *const file,
int const line)
{
if (result)
{
fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func);
exit(EXIT_FAILURE);
}
}
inline void __getLastCudaError(const char *errorMessage, const char *file,
const int line)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr,
"%s(%i) : getLastCudaError() CUDA error :"
" %s : (%d) %s.\n",
file, line, errorMessage, (int)(err),
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
// Beginning of GPU Architecture definitions
int _ConvertSMVer2Cores(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine
// the # of cores per SM
typedef struct
{
int SM; // 0xMm (hexidecimal notation), M = SM Major version,
// and m = SM minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] = {
{0x30, 192},
{0x32, 192},
{0x35, 192},
{0x37, 192},
{0x50, 128},
{0x52, 128},
{0x53, 128},
{0x60, 64},
{0x61, 128},
{0x62, 128},
{0x70, 64},
{0x72, 64},
{0x75, 64},
{0x80, 64},
{0x86, 128},
{0x87, 128},
{-1, -1}};
int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1)
{
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
{
return nGpuArchCoresPerSM[index].Cores;
}
index++;
}
// If we don't find the values, we default use the previous one
// to run properly
printf(
"MapSMtoCores for SM %d.%d is undefined."
" Default to use %d Cores/SM\n",
major, minor, nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
}
inline const char *_ConvertSMVer2ArchName(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine
// the GPU Arch name)
typedef struct
{
int SM; // 0xMm (hexidecimal notation), M = SM Major version,
// and m = SM minor version
const char *name;
} sSMtoArchName;
sSMtoArchName nGpuArchNameSM[] = {
{0x30, "Kepler"},
{0x32, "Kepler"},
{0x35, "Kepler"},
{0x37, "Kepler"},
{0x50, "Maxwell"},
{0x52, "Maxwell"},
{0x53, "Maxwell"},
{0x60, "Pascal"},
{0x61, "Pascal"},
{0x62, "Pascal"},
{0x70, "Volta"},
{0x72, "Xavier"},
{0x75, "Turing"},
{0x80, "Ampere"},
{0x86, "Ampere"},
{-1, "Graphics Device"}};
int index = 0;
while (nGpuArchNameSM[index].SM != -1)
{
if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
{
return nGpuArchNameSM[index].name;
}
index++;
}
// If we don't find the values, we default use the previous one
// to run properly
printf(
"MapSMtoArchName for SM %d.%d is undefined."
" Default to use %s\n",
major, minor, nGpuArchNameSM[index - 1].name);
return nGpuArchNameSM[index - 1].name;
}
// end of GPU Architecture definitions
// end of CUDA Helper Functions
#endif // COMMON_HELPER_CUDA_H_
\ No newline at end of file
Q.13
Op = 4096 * 4096 * 4096 = 68719476736 = 69 Gfp32
Vr = 4096 * 4096 * 2 * 4 = 134217728 = 134 Mo
Vw = 4096 * 4096 * 4 = 67108864 = 67 Mo
mem théorique GPU: 56 Go/s
flop théorique: 2273 Gfp32
V = Vr + Vw = 134 + 67 = 201 Mo
Tdata = 201 / 56 = 3.589 ms
Tcompute = 68 / 2.273 = 29.92 ms
rho = 29.92 / 3.59 ) 8.334
Le plus gros du temps est passer sur les calculs.
Q.14
Performance réel: 9769.44 Mfp32
9.76944 / 2273 = 0.0043 = 0.43 % d'utilisation maximal des performances de calcules.
Q.15
Op = 4096 * 4096 * 4096 = 68719476736 = 69 Gfp32
Vr = 4096 * 4096 * 4096 * 2 * 4 = 549755813888 = 550 Go
Vw = 4096 * 4096 * 4 = 67108864 = 67 Mo
Le trafique mémoire au pire est de 550 Go.
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous pour commenter