diff --git a/TPs/TP0/CODE/device_query/prog.cu b/TPs/TP0/CODE/device_query/prog.cu index 07d8b357d74230ad627ba43f6524a30b12e5f751..d717ee3b94e62d8ff1c31e2bd227a9ee025440d0 100644 --- a/TPs/TP0/CODE/device_query/prog.cu +++ b/TPs/TP0/CODE/device_query/prog.cu @@ -25,8 +25,8 @@ int main(int argc, char **argv) { printf(" GPU Clock Rate (MHz): %d\n", prop.clockRate/1000); printf(" Memory Clock Rate (MT/s): %d\n", prop.memoryClockRate/1000); printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth); - printf(" Memory Band width (Gio/s): %d\n", - (((prop.memoryClockRate / 1024) * (prop.memoryBusWidth / 8)) / 1024) * 2); + printf(" Memory Band width (Go/s): %d\n", + (((prop.memoryClockRate / 1000) * (prop.memoryBusWidth / 8)) / 1000)); printf(" CUDA Cores/MP: %d\n", _ConvertSMVer2Cores(prop.major, prop.minor)); printf(" CUDA Cores: %d\n", _ConvertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); diff --git a/TPs/TP1/CODE/Partie2/Makefile b/TPs/TP1/CODE/Partie2/Makefile index e34c97c2b64e4a3d9d15d1b7791002bcad7197cd..4ec4d51a39cdf5450329fdbfd54335faaa144de8 100755 --- a/TPs/TP1/CODE/Partie2/Makefile +++ b/TPs/TP1/CODE/Partie2/Makefile @@ -23,7 +23,7 @@ run_cpu: $(TARGET) ./$(TARGET) 2000 run_gpu: $(TARGET) - ./$(TARGETGPU) 1024 + ./$(TARGETGPU) 2048 proper : rm -f *.o diff --git a/TPs/TP1/CODE/Partie2/dgemm.cu b/TPs/TP1/CODE/Partie2/dgemm.cu index eaea2550af035b224b79e5f74b6ea30e88239446..d31c37d6e137765420577c3c4e7ca991696eef9e 100755 --- a/TPs/TP1/CODE/Partie2/dgemm.cu +++ b/TPs/TP1/CODE/Partie2/dgemm.cu @@ -7,6 +7,8 @@ #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +#define BLOCK_SIZE 32 + #include "helper_cuda.h" /** return time in second @@ -87,8 +89,8 @@ __global__ void multk(double *A, double *B, double *C, int N) __global__ void multk2(double *A, double *B, double *C, int N) { - const int i = blockIdx.y * blockDim.y + threadIdx.y; const int j = blockIdx.x * blockDim.x + threadIdx.x; + const int i = blockIdx.y * blockDim.y + threadIdx.y; if (i >= N || j >= N) return; @@ -102,14 +104,14 @@ __global__ void multk2(double *A, double *B, double *C, int N) __global__ void multk3(double *A, double *B, double *C, int N) { - const int blockSize = 32; - __shared__ double A_s[32 * 32]; - __shared__ double B_s[32 * 32]; + const int blockSize = BLOCK_SIZE; + __shared__ double A_s[BLOCK_SIZE * BLOCK_SIZE]; + __shared__ double B_s[BLOCK_SIZE * BLOCK_SIZE]; - const int i = blockIdx.y * blockSize + threadIdx.y; - const int j = blockIdx.x * blockSize + threadIdx.x; + const int x = blockIdx.x * blockSize + threadIdx.x; + const int y = blockIdx.y * blockSize + threadIdx.y; - if (i >= N || j >= N) + if (x >= N || y >= N) return; double sum = 0; @@ -118,9 +120,9 @@ __global__ void multk3(double *A, double *B, double *C, int N) int k = z + threadIdx.x; int l = z + threadIdx.y; if (k < N) - A_s[threadIdx.y * blockDim.x + threadIdx.x] = A[i * N + k]; + A_s[threadIdx.y * blockDim.x + threadIdx.x] = A[y * N + k]; if (l < N) - B_s[threadIdx.y * blockDim.x + threadIdx.x] = B[l * N + j]; + B_s[threadIdx.y * blockDim.x + threadIdx.x] = B[l * N + x]; __syncthreads(); for (int m = 0; m < blockSize; ++m){ @@ -131,7 +133,40 @@ __global__ void multk3(double *A, double *B, double *C, int N) __syncthreads(); } - C[i * N + j] = sum; + C[y * N + x] = sum; +} + +__global__ void d_shared_mult(double* A, double* B, double* C, int size) +{ + __shared__ double s_A[1024]; + __shared__ double s_B[1024]; + + unsigned x = blockDim.x*blockIdx.x + threadIdx.x; + unsigned y = blockDim.y*blockIdx.y + threadIdx.y; + //pos in memory is x + y*size + + int nsize = size/blockDim.x; //we suppose it is squared + double sum = 0.0; + + for (int i = 0; i < nsize; i++) + { + //LOADING IN SHARED MEMORY + s_A[threadIdx.x + threadIdx.y*blockDim.x] + = A[(i*blockDim.x + blockIdx.y*gridDim.x*blockDim.x*blockDim.y) + +(threadIdx.x + threadIdx.y*blockDim.x*gridDim.x)]; + + s_B[threadIdx.x + threadIdx.y*blockDim.x] + = B[(blockIdx.x*blockDim.x + i*gridDim.x*blockDim.x*blockDim.y) + +(threadIdx.x + threadIdx.y*blockDim.x*gridDim.x)]; + + __syncthreads(); + for (int j = 0; j < blockDim.x; j++) + { + sum += s_A[j + threadIdx.y*blockDim.x] * s_B[threadIdx.x + j*blockDim.x]; + } + __syncthreads(); + } + C[x + y*size] = sum; } int main(int argc, char** argv){ @@ -148,7 +183,7 @@ int main(int argc, char** argv){ double t0 = 0., t1 = 0., duration = 0.; - N = (argc < 2) ? 1000 : atoi(argv[1]); + N = (argc < 2) ? 1024 : atoi(argv[1]); fprintf(stdout, "Matrix Multiplication\n Size: %dx%d\n", N, N); // Memory allocation @@ -168,76 +203,98 @@ int main(int argc, char** argv){ checkCudaErrors(cudaMemcpy(B_d, B_h, sizeof(double) * N * N, cudaMemcpyHostToDevice)); - // Compute multiplication - + + + uint64_t N_u64 = (uint64_t) N; + uint64_t nb_op = N_u64 * N_u64 * N_u64; + + + + // CPU Compute multiplication t0 = get_elapsedtime(); mult(A_h, B_h, C_cpu, N); t1 = get_elapsedtime(); - cudaEvent_t ct[6]; - - for(int i = 0; i < 6; ++i) - cudaEventCreate(&ct[i]); + duration = (t1 - t0); - float cudaDuration[3]; + fprintf(stdout, "CPU Performance results: \n"); + fprintf(stdout, " Time: %lf s\n", duration); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); - int blockSize = 32; + + + // CUDA PREP + cudaEvent_t ct0; + cudaEvent_t ct1; + cudaEventCreate(&ct0); + cudaEventCreate(&ct1); + float cudaDuration; + + int blockSize = BLOCK_SIZE; dim3 dimBlock(blockSize, blockSize, 1); dim3 dimGrid((N / blockSize) + 1, (N / blockSize) + 1, 1); //int sharedMem = blockSize * blockSize * 2 * sizeof(double); - cudaEventRecord(ct[0]); + + + cudaEventRecord(ct0); multk<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N); - cudaEventRecord(ct[1]); + cudaEventRecord(ct1); cudaDeviceSynchronize(); getLastCudaError("Mult kernel 1 failed"); + cudaEventElapsedTime(&cudaDuration, ct0, ct1); + checkCudaErrors(cudaMemcpy(C_h, C_d, + sizeof(double) * N * N, cudaMemcpyDeviceToHost)); + fprintf(stdout, "GPU Performance results 1: \n"); + fprintf(stdout, " Time: %lf s\n", cudaDuration / 1000); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3); + verify_matrix(C_cpu, C_h, N); - - cudaEventRecord(ct[2]); + + + cudaEventRecord(ct0); multk2<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N); - cudaEventRecord(ct[3]); + cudaEventRecord(ct1); cudaDeviceSynchronize(); getLastCudaError("Mult kernel 2 failed"); + cudaEventElapsedTime(&cudaDuration, ct0, ct1); + checkCudaErrors(cudaMemcpy(C_h, C_d, + sizeof(double) * N * N, cudaMemcpyDeviceToHost)); + fprintf(stdout, "GPU Performance results 2: \n"); + fprintf(stdout, " Time: %lf s\n", cudaDuration / 1000); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3); + verify_matrix(C_cpu, C_h, N); - cudaEventRecord(ct[4]); + cudaEventRecord(ct0); //multk3<<<dimGrid, dimBlock, sharedMem>>>(A_d, B_d, C_d, N); multk3<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N); - cudaEventRecord(ct[5]); + cudaEventRecord(ct1); cudaDeviceSynchronize(); getLastCudaError("Mult kernel 3 failed"); - - - - cudaEventElapsedTime(&cudaDuration[0], ct[0], ct[1]); - cudaEventElapsedTime(&cudaDuration[1], ct[2], ct[3]); - cudaEventElapsedTime(&cudaDuration[2], ct[4], ct[5]); - - checkCudaErrors(cudaMemcpy(C_h, C_d, + cudaEventElapsedTime(&cudaDuration, ct0, ct1); + checkCudaErrors(cudaMemcpy(C_h, C_d, sizeof(double) * N * N, cudaMemcpyDeviceToHost)); - + fprintf(stdout, "GPU Performance results 3: \n"); + fprintf(stdout, " Time: %lf s\n", cudaDuration / 1000); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3); verify_matrix(C_cpu, C_h, N); - // Pretty print - duration = (t1 - t0); - uint64_t N_u64 = (uint64_t) N; - uint64_t nb_op = N_u64 * N_u64 * N_u64; - fprintf(stdout, "CPU Performance results: \n"); - fprintf(stdout, " Time: %lf s\n", duration); - fprintf(stdout, " MFlops: %.2f\n", (nb_op / duration)*1E-6); - fprintf(stdout, "GPU Performance results 1: \n"); - fprintf(stdout, " Time: %lf s\n", cudaDuration[0] / 1000); - fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration[0]) * 1E-3); - - fprintf(stdout, "GPU Performance results 2: \n"); - fprintf(stdout, " Time: %lf s\n", cudaDuration[1] / 1000); - fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration[1]) * 1E-3); + cudaEventRecord(ct0); + d_shared_mult<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, N); + cudaEventRecord(ct1); + cudaDeviceSynchronize(); + getLastCudaError("Mult kernel Anzo failed"); + cudaEventElapsedTime(&cudaDuration, ct0, ct1); + checkCudaErrors(cudaMemcpy(C_h, C_d, + sizeof(double) * N * N, cudaMemcpyDeviceToHost)); - fprintf(stdout, "GPU Performance results 3: \n"); - fprintf(stdout, " Time: %lf s\n", cudaDuration[2] / 1000); - fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration[2]) * 1E-3); + fprintf(stdout, "GPU Performance results Anzo: \n"); + fprintf(stdout, " Time: %lf s\n", cudaDuration / 1000); + fprintf(stdout, " MFlops: %.2f\n", (nb_op / cudaDuration) * 1E-3); + verify_matrix(C_cpu, C_h, N); return 0; }