diff --git a/Cours/APM_Cours2.pdf b/Cours/APM_Cours2.pdf index c118429dc01c63ebb0dfe621ae3cf8eb1f060447..ff476756eed647ff381ab38f753a8bba80a19560 100644 Binary files a/Cours/APM_Cours2.pdf and b/Cours/APM_Cours2.pdf differ diff --git a/TDs/TD1/CORRECTION/Partie2/td1_p2.c b/TDs/TD1/CORRECTION/Partie2/td1_p2.c new file mode 100644 index 0000000000000000000000000000000000000000..2c935b1b9407042fa0d9f083a5bc696447728392 --- /dev/null +++ b/TDs/TD1/CORRECTION/Partie2/td1_p2.c @@ -0,0 +1,86 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <string.h> + + +void kernel(double *a, double *b, double *c, int N, int dimBlock, int blockId, int threadId) +{ + + int i, j, k; + i = blockId * dimBlock + threadId; + + if (i < N) + { + c[i] = a[i] + b[i]; + } +} + +int main(int argc, char **argv) +{ + int N = 1000; + int sz_in_bytes = N*sizeof(double); + + double *h_a, *h_b, *h_c; + double *d_a, *d_b, *d_c; + + h_a = (double*)malloc(sz_in_bytes); + h_b = (double*)malloc(sz_in_bytes); + h_c = (double*)malloc(sz_in_bytes); + + // Initiate values on h_a and h_b + for(int i = 0 ; i < N ; i++) + { + h_a[i] = 1./(1.+i); + h_b[i] = (i-1.)/(i+1.); + } + + // 3-arrays allocation on device +// cudaMalloc((void**)&d_a, sz_in_bytes); +// cudaMalloc((void**)&d_b, sz_in_bytes); +// cudaMalloc((void**)&d_c, sz_in_bytes); + d_a = (double*)malloc(sz_in_bytes); + d_b = (double*)malloc(sz_in_bytes); + d_c = (double*)malloc(sz_in_bytes); + + + // copy on device values pointed on host by h_a and h_b + // (the new values are pointed by d_a et d_b on device) + +// cudaMemcpy(d_a, h_a, sz_in_bytes, cudaMemcpyHostToDevice); +// cudaMemcpy(d_b, h_b, sz_in_bytes, cudaMemcpyHostToDevice); + memcpy(d_a, h_a, sz_in_bytes); + memcpy(d_b, h_b, sz_in_bytes); + + //dim3 dimBlock(64, 1, 1); + //dim3 dimGrid((N + dimBlock.x - 1)/dimBlock.x, 1, 1); + int dimBlock = (64); + int dimGrid = ((N + 64 - 1)/64); + + for(int blockId = 0; blockId < dimGrid; ++blockId) + { + for(int tid = 0; tid < dimBlock; ++tid) + { + kernel(d_a, d_b, d_c, N, dimBlock, blockId, tid); + } + } + + // Result is pointed by d_c on device + // Copy this result on host (result pointed by h_c on host) +// cudaMemcpy(h_c, d_c, sz_in_bytes, cudaMemcpyDeviceToHost); + memcpy(h_c, d_c, sz_in_bytes); + + // freeing on device +// cudaFree(d_a); +// cudaFree(d_b); +// cudaFree(d_c); + free(d_a); + free(d_b); + free(d_c); + + free(h_a); + free(h_b); + free(h_c); + + return 0; +} diff --git a/TDs/TD1/CORRECTION/Partie2/td2.c b/TDs/TD1/CORRECTION/Partie2/td2.c new file mode 100644 index 0000000000000000000000000000000000000000..e331b0a3953c0d48b4306d14c782c93ae101dfe0 --- /dev/null +++ b/TDs/TD1/CORRECTION/Partie2/td2.c @@ -0,0 +1,86 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> + + +void kernel(double *a, double *b, double *c, int N, int dimGrid, int dimBlock) +{ + + int i, j, k; + + for(j=0; j<dimGrid; j++) + { + for(k=0; k<dimBlock; k++) + { + i = j * dimBlock + k; + if (i < N) + { + c[i] = a[i] + b[i]; + } + } + } +} + +int main(int argc, char **argv) +{ + int N = 1000; + int sz_in_bytes = N*sizeof(double); + + double *h_a, *h_b, *h_c; + double *d_a, *d_b, *d_c; + + h_a = (double*)malloc(sz_in_bytes); + h_b = (double*)malloc(sz_in_bytes); + h_c = (double*)malloc(sz_in_bytes); + + // Initiate values on h_a and h_b + for(int i = 0 ; i < N ; i++) + { + h_a[i] = 1./(1.+i); + h_b[i] = (i-1.)/(i+1.); + } + + // 3-arrays allocation on device +// cudaMalloc((void**)&d_a, sz_in_bytes); +// cudaMalloc((void**)&d_b, sz_in_bytes); +// 
cudaMalloc((void**)&d_c, sz_in_bytes); + d_a = (double*)malloc(sz_in_bytes); + d_b = (double*)malloc(sz_in_bytes); + d_c = (double*)malloc(sz_in_bytes); + + + // copy on device values pointed on host by h_a and h_b + // (the new values are pointed by d_a et d_b on device) + +// cudaMemcpy(d_a, h_a, sz_in_bytes, cudaMemcpyHostToDevice); +// cudaMemcpy(d_b, h_b, sz_in_bytes, cudaMemcpyHostToDevice); + memcpy(d_a, h_a, sz_in_bytes); + memcpy(d_b, h_b, sz_in_bytes); + + //dim3 dimBlock(64, 1, 1); + //dim3 dimGrid((N + dimBlock.x - 1)/dimBlock.x, 1, 1); + int dimBlock = (64); + int dimGrid = ((N + 64 - 1)/64); + + kernel(d_a, d_b, d_c, N, dimGrid, dimBlock); + + // Result is pointed by d_c on device + // Copy this result on host (result pointed by h_c on host) +// cudaMemcpy(h_c, d_c, sz_in_bytes, cudaMemcpyDeviceToHost); + memcpy(h_c, d_c, sz_in_bytes); + + // freeing on device +// cudaFree(d_a); +// cudaFree(d_b); +// cudaFree(d_c); + free(d_a); + free(d_b); + free(d_c); + + + free(h_a); + free(h_b); + free(h_c); + + return 0; +} diff --git a/TDs/TD1/SUJET/td1.pdf b/TDs/TD1/SUJET/td1.pdf index 2b625b1e169022ff40e61423f10e8c6349033f7c..c409c6b0c0cc64db65c2a1de90c43fe94f2f4ceb 100644 Binary files a/TDs/TD1/SUJET/td1.pdf and b/TDs/TD1/SUJET/td1.pdf differ diff --git a/TDs/TD1/SUJET/td1_slide.pdf b/TDs/TD1/SUJET/td1_slide.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a963410bce4d5231fb29f16344b9d865a35511d0 Binary files /dev/null and b/TDs/TD1/SUJET/td1_slide.pdf differ diff --git a/TDs/TD2/CORRECTION/Makefile b/TDs/TD2/CORRECTION/Makefile index b51970c246b010e4eff4e06c30f78f7212bc8cc7..2e07ac4f537854305082a4a968a760f15963edce 100755 --- a/TDs/TD2/CORRECTION/Makefile +++ b/TDs/TD2/CORRECTION/Makefile @@ -5,14 +5,13 @@ CFLAGS=-O3 -Wall EXE1=pi.exe EXE2=pi_omp.exe # EXE3=pi_task.exe -#EXE3=pi_cuda.exe -#EXE4=pi_cuda_shared.exe -#EXE5=pi_cuda_shared_2.exe -#EXE6=pi_cuda_shared_3.exe -#EXE7=pi_cuda_shared_4.exe +EXE3=pi_cuda.exe +EXE4=pi_cuda_shared.exe +EXE5=pi_cuda_shared_2.exe +EXE6=pi_cuda_shared_3.exe +EXE7=pi_cuda_shared_4.exe -all : $(EXE1) $(EXE2) -#$(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) +all : $(EXE1) $(EXE2) $(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7) $(EXE1) : pi_sequentiel.o $(CC) $(CFLAGS) -o $@ $< @@ -20,9 +19,6 @@ $(EXE1) : pi_sequentiel.o $(EXE2) : pi_omp.o $(CC) $(CFLAGS_OMP) -o $@ $< -# $(EXE3) : pi_task.o -# $(CC) $(CFLAGS_OMP) -o $@ $< - $(EXE3) : pi_cuda.cu $(CUDA_CC) -O3 -o $@ $< diff --git a/TDs/TD2/CORRECTION/pi_cuda.cu b/TDs/TD2/CORRECTION/pi_cuda.cu new file mode 100755 index 0000000000000000000000000000000000000000..7762ce1806913235615124be84680dfd6720b2f5 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda.cu @@ -0,0 +1,84 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); } + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate, curandState *states) { + unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + + curand_init(2020, tid, 0, &states[tid]); // Initialize CURAND + + for(int i = 0; i < 
TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states[tid]); + y = curand_uniform (&states[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * THREADS * sizeof(float)); // allocate device mem. for counts +/* FIN QUESTION 4 */ + +/* QUESTION 6 */ + curandState *d_states; + cudaMalloc( (void **)&d_states, THREADS * BLOCKS * sizeof(curandState) ); +/* FIN QUESTION 6 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts, d_states); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * THREADS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS * THREADS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= (BLOCKS * THREADS); + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared.cu b/TDs/TD2/CORRECTION/pi_cuda_shared.cu new file mode 100755 index 0000000000000000000000000000000000000000..53212f22f7af28aa646b7564efef5b26b37ea822 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared.cu @@ -0,0 +1,92 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=1; s < THREADS; s*=2) { + if (tid % (2*s) == 0) + estimate_s[tid] += estimate_s[tid + s]; + __syncthreads(); + } + + if (tid == 0) + estimate[blockIdx.x] = estimate_s[0]; +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. 
for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= BLOCKS * THREADS; + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu new file mode 100755 index 0000000000000000000000000000000000000000..6b8a10556e63b09eb366b4e209997d98612ce4a0 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared_2.cu @@ -0,0 +1,93 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); } + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=1; s < THREADS; s*=2) { + int index = 2 * s * tid; + if (index < THREADS) + estimate_s[index] += estimate_s[index + s]; + __syncthreads(); + } + + if (tid == 0) + estimate[blockIdx.x] = estimate_s[0]; +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. 
for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= BLOCKS * THREADS; + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu new file mode 100755 index 0000000000000000000000000000000000000000..8213f5af128395c5730800990fc8e5c9a2a2bf55 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared_3.cu @@ -0,0 +1,92 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=THREADS/2; s>0; s>>=1){ + if (tid < s) + estimate_s[tid] += estimate_s[tid + s]; + __syncthreads(); + } + + if (tid == 0) + estimate[blockIdx.x] = estimate_s[0]; +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts[BLOCKS * THREADS] = { 0 }; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)); // allocate device mem. 
for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = 0.f; + for(int i = 0; i < BLOCKS; i++) + { + pi_gpu += h_counts[i]; + } + + pi_gpu /= BLOCKS * THREADS; + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu new file mode 100755 index 0000000000000000000000000000000000000000..3a92945d90f641cd3d890a794fa0057bc69c1594 --- /dev/null +++ b/TDs/TD2/CORRECTION/pi_cuda_shared_4.cu @@ -0,0 +1,86 @@ +#include <stdlib.h> +#include <stdio.h> +#include <cuda.h> +#include <math.h> +#include <time.h> +#include <curand_kernel.h> + +#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t) +#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec) +/** return time in second +*/ +double get_elapsedtime(void) +{ + struct timespec st; + int err = gettime(&st); + if (err !=0) return 0; + return (double)st.tv_sec + get_sub_seconde(st); +} + +/* QUESTION 3 */ +#define TRIALS_PER_THREAD 4096 +#define BLOCKS 512 +#define THREADS 256 +/* FIN QUESTION 3*/ + +/* QUESTION 6 */ +__global__ void gpu_monte_carlo(float *estimate) { + unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + int points_in_circle = 0; + float x = 0., y = 0.; + __shared__ float estimate_s[THREADS]; + __shared__ curandState states_s[THREADS]; + + curand_init(2020, gtid, 0, &states_s[tid]); // Initialize CURAND + + for(int i = 0; i < TRIALS_PER_THREAD; i++) + { + x = curand_uniform (&states_s[tid]); + y = curand_uniform (&states_s[tid]); + points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle. + } + estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // return estimate of pi + __syncthreads(); + + for (unsigned int s=THREADS/2; s > 0; s>>=1) { + if (tid < s) + estimate_s[tid] += estimate_s[tid + s]; + __syncthreads(); + } + + if (tid == 0) + atomicAdd(estimate, estimate_s[0]); +} +/* FIN QUESTION 6 */ + +int main (int argc, char *argv[]) { + float h_counts = 0; + double t0 = 0., t1 = 0., duration = 0.; + + printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS); + +/* QUESTION 4 */ + float *d_counts; + cudaMalloc((void **) &d_counts, sizeof(float)); // allocate device mem. 
for counts +/* FIN QUESTION 4 */ + + t0 = get_elapsedtime(); +/* QUESTION 3 */ + gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts); +/* FIN QUESTION 3*/ + +/* QUESTION 5 */ + cudaMemcpy(&h_counts, d_counts, sizeof(float), cudaMemcpyDeviceToHost); // return results +/* FIN QUESTION 5 */ + + float pi_gpu = h_counts / (BLOCKS * THREADS); + + t1 = get_elapsedtime(); + duration = (t1 - t0); + + printf("GPU pi calculated in %lf s.\n", duration); + fprintf(stdout, "Pi ~= %lf\n", pi_gpu); + + return 0; +} diff --git a/TDs/TD2/SUJET/td2.pdf b/TDs/TD2/SUJET/td2.pdf index 1f9480b357c5e3333f44d550ed2e0ef18113b5b1..c38d9f4cb310957d923abd90d94b241127af1175 100644 Binary files a/TDs/TD2/SUJET/td2.pdf and b/TDs/TD2/SUJET/td2.pdf differ diff --git a/TPs/TP0/CODE/common/helper_cuda.h b/TPs/TP0/CODE/common/helper_cuda.h index 3dd446dced85de9d10eb9f1597c56099fa358bdb..77ca0bf8a3091a41d462238e6b8cda540b6868c3 100644 --- a/TPs/TP0/CODE/common/helper_cuda.h +++ b/TPs/TP0/CODE/common/helper_cuda.h @@ -46,17 +46,16 @@ #define MIN(a, b) (a < b ? a : b) #endif -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) +#define checkCudaErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); } -void check(cudaError_t result, char const *const func, const char *const file, - int const line) +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=false) { - if (result) - { - fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func); - exit(EXIT_FAILURE); - } + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } } inline void __getLastCudaError(const char *errorMessage, const char *file, @@ -180,4 +179,4 @@ inline const char *_ConvertSMVer2ArchName(int major, int minor) // end of CUDA Helper Functions -#endif // COMMON_HELPER_CUDA_H_ \ No newline at end of file +#endif // COMMON_HELPER_CUDA_H_ diff --git a/TPs/TP0/CODE/compute-sanitizer/Makefile b/TPs/TP0/CODE/compute-sanitizer/Makefile index 2283121ddd828e6e4f8750965ef592866b4acec1..098d87b12f2989aa389f02fddd4ca65df28bd525 100644 --- a/TPs/TP0/CODE/compute-sanitizer/Makefile +++ b/TPs/TP0/CODE/compute-sanitizer/Makefile @@ -1,8 +1,8 @@ CC = nvcc -lineinfo -INCLUDES = +INCLUDES = -I../common # the build target executable: -EXE = memcheck.exe initcheck.exe +EXE = memcheck.exe initcheck.exe out_of_bound.exe all: $(EXE) diff --git a/TPs/TP0/CODE/compute-sanitizer/memcheck.cu b/TPs/TP0/CODE/compute-sanitizer/memcheck.cu index ad3fc59b16be296946be73084b4f19bb676c52fe..7cccf6ab69581d4cc18f137c90e460279a6be50e 100644 --- a/TPs/TP0/CODE/compute-sanitizer/memcheck.cu +++ b/TPs/TP0/CODE/compute-sanitizer/memcheck.cu @@ -4,7 +4,7 @@ __global__ void saxpy(int n, float a, float *x, float *y) { int i = blockIdx.x * blockDim.x + threadIdx.x; - y[i] = a * x[i] + y[i]; + if (i <= n) y[i] = a * x[i] + y[i]; } int main(void) { diff --git a/TPs/TP0/CODE/compute-sanitizer/out_of_bound.cu b/TPs/TP0/CODE/compute-sanitizer/out_of_bound.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ceaad48461a65759886060934cb5dd4fd5a4672 --- /dev/null +++ b/TPs/TP0/CODE/compute-sanitizer/out_of_bound.cu @@ -0,0 +1,13 @@ +#include "helper_cuda.h" + +__global__ void k1(char *d) { d[41 + 128] = 0; } + +int main() { + char *d; + checkCudaErrors(cudaMalloc(&d, 42)); + + k1<<<1, 1>>>(d); + checkCudaErrors(cudaGetLastError()); + 
checkCudaErrors(cudaDeviceSynchronize()); + +} diff --git a/TPs/TP0/CODE/compute-sanitizer/tags b/TPs/TP0/CODE/compute-sanitizer/tags index f41805ff57b31cfd930077730d066b2145261de1..3ca8122258a249f317cd0d4fe0ea2e13ff376c4e 100644 --- a/TPs/TP0/CODE/compute-sanitizer/tags +++ b/TPs/TP0/CODE/compute-sanitizer/tags @@ -4,6 +4,6 @@ !_TAG_PROGRAM_NAME Exuberant Ctags // !_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/ !_TAG_PROGRAM_VERSION 5.9~svn20110310 // -CC Makefile /^CC = nvcc$/;" m -EXE Makefile /^EXE = memcheck.exe initcheck.exe $/;" m -INCLUDES Makefile /^INCLUDES = $/;" m +CC Makefile /^CC = nvcc -lineinfo$/;" m +EXE Makefile /^EXE = memcheck.exe initcheck.exe out_of_bound.exe $/;" m +INCLUDES Makefile /^INCLUDES = -I..\/common $/;" m diff --git a/TPs/TP0/CODE/device_query/Makefile b/TPs/TP0/CODE/device_query/Makefile index 73b63d518ef203ed6ec1b19cbec0c553bc630a18..32e68d57b8d6fb95157ba9072e94601681308929 100644 --- a/TPs/TP0/CODE/device_query/Makefile +++ b/TPs/TP0/CODE/device_query/Makefile @@ -1,5 +1,8 @@ +#*********************************************************** +# Created by Patricio Bulic, Davor Sluga UL FRI on 15/6/2022 +#*********************************************************** CC = nvcc -INCLUDES = -I../common +INCLUDES = -I. # the build target executable: TARGET = prog diff --git a/TPs/TP0/CODE/error_checking/exemple1.cu b/TPs/TP0/CODE/error_checking/exemple1.cu index 58d7a39197d61cebf8e174c86dde4d5c902214e4..963dc57662debaecf5def59991db80d4aa338a63 100755 --- a/TPs/TP0/CODE/error_checking/exemple1.cu +++ b/TPs/TP0/CODE/error_checking/exemple1.cu @@ -3,14 +3,19 @@ #include "helper_cuda.h" #define THREADS 1024 -//#define THREADS 4096 -#define TAB_SIZE 8192 +//#define THREADS 2048 +#define TAB_SIZE 100000 __global__ void kernel(int *a, int *b, int *c) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < TAB_SIZE) c[tid] = a[tid] + b[tid]; } +__global__ void init(int *a, int value) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) a[tid] = value; +} + int main(int argc, char **argv) { int sz_in_bytes = sizeof(int) * TAB_SIZE; @@ -22,18 +27,23 @@ int main(int argc, char **argv) // Allocation on host (malloc) h_c = (int *)malloc(sz_in_bytes); + // Kernel configuration + dim3 dimBlock(THREADS, 1, 1); + dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); + // Allocation on device (cudaMalloc) checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes)); checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes)); checkCudaErrors(cudaMalloc((void **)&d_c, sz_in_bytes)); - checkCudaErrors(cudaMemset(d_a, 1, sz_in_bytes)); - checkCudaErrors(cudaMemset(d_b, 2, sz_in_bytes)); - cudaDeviceSynchronize(); + init<<<dimGrid, dimBlock>>>(d_a, 1); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + + init<<<dimGrid, dimBlock>>>(d_b, 2); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); - // Kernel configuration - dim3 dimBlock(THREADS, 1, 1); - dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); // Kernel launch kernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c); diff --git a/TPs/TP0/CODE/error_checking/exemple1.exe b/TPs/TP0/CODE/error_checking/exemple1.exe deleted file mode 100755 index 92ff871b850ef8a9c61848b3b09fc28a55715118..0000000000000000000000000000000000000000 Binary files a/TPs/TP0/CODE/error_checking/exemple1.exe and /dev/null differ diff --git a/TPs/TP0/CODE/error_checking/exemple2.cu b/TPs/TP0/CODE/error_checking/exemple2.cu index 
f863dde23bdab204fd7aead195abe78a7876fe9e..ff6ccfb971fe255327eab462d38571a558887671 100755 --- a/TPs/TP0/CODE/error_checking/exemple2.cu +++ b/TPs/TP0/CODE/error_checking/exemple2.cu @@ -2,34 +2,43 @@ #include <stdlib.h> #include "helper_cuda.h" -#define THREADS 256 -#define TAB_SIZE 8192 +#define THREADS 64 +#define TAB_SIZE 1000 __global__ void copy(int *a, int *b) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid <= TAB_SIZE) b[tid] = a[tid]; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) b[tid] = a[tid]; } +__global__ void init(int *a, int value) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) a[tid] = value; +} + + int main(int argc, char **argv) { - int sz_in_bytes = sizeof(int) * TAB_SIZE; + int sz_in_bytes = sizeof(int) * TAB_SIZE; int *h_b; int res = 0; int *d_a, *d_b, *d_c; - // Allocation on host (malloc) - h_b = (int *)malloc(sz_in_bytes); + // Allocation on host (malloc) + h_b = (int *)malloc(sz_in_bytes); - // Allocation on device (cudaMalloc) - checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes)); + // Allocation on device (cudaMalloc) + checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes)); + // suppose we forget this + // checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes)); - checkCudaErrors(cudaMemset(d_a, 1, sz_in_bytes)); + // Kernel configuration + dim3 dimBlock(THREADS, 1, 1); + dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); - // Kernel configuration - dim3 dimBlock(THREADS, 1, 1); - dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); + init<<<dimGrid, dimBlock>>>(d_a, 1); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); // Kernel launch copy<<<dimGrid, dimBlock>>>(d_a, d_b); @@ -37,8 +46,8 @@ int main(int argc, char **argv) checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("copy kernel error after sync"); - // Retrieving data from device (cudaMemcpy) - checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost)); + // Retrieving data from device (cudaMemcpy) + checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost)); // checking if cudamalloc is still available checkCudaErrors(cudaMalloc((void **)&d_c, sz_in_bytes)); @@ -48,19 +57,23 @@ int main(int argc, char **argv) checkCudaErrors(cudaFree(d_a)); checkCudaErrors(cudaFree(d_b)); - // computing sum of tab element - for (int i = 0; i < TAB_SIZE; i++) res += h_b[i]; + // Freeing on device (cudaFree) + checkCudaErrors(cudaFree(d_a)); + checkCudaErrors(cudaFree(d_b)); + + // computing sum of tab element + for (int i = 0; i < TAB_SIZE; i++) res += h_b[i]; - // Verifying if - if (res == TAB_SIZE) { - fprintf(stderr, "TEST PASSED !\n"); - } - else - { - fprintf(stderr, "TEST FAILED !\n"); - } + // Verifying if + if (res == TAB_SIZE) { + fprintf(stderr, "TEST PASSED !\n"); + } + else + { + fprintf(stderr, "TEST FAILED !\n"); + } - free(h_b); + free(h_b); - return 0; + return 0; } diff --git a/TPs/TP0/CODE/error_checking/exemple2.exe b/TPs/TP0/CODE/error_checking/exemple2.exe deleted file mode 100755 index ea89892735b6df2a2e60d443515ec9de4c026f5a..0000000000000000000000000000000000000000 Binary files a/TPs/TP0/CODE/error_checking/exemple2.exe and /dev/null differ diff --git a/TPs/TP0/CODE/error_checking/exemple3.cu b/TPs/TP0/CODE/error_checking/exemple3.cu index d2a0c2ce1dd4eb0e8f55c29b89402f59146c1ef2..e8cd6a045f54f66412c1d6db9335489da58e666d 100755 --- a/TPs/TP0/CODE/error_checking/exemple3.cu +++ 
b/TPs/TP0/CODE/error_checking/exemple3.cu @@ -2,61 +2,72 @@ #include <stdlib.h> #include "helper_cuda.h" -#define THREADS 256 -#define TAB_SIZE 8192 +#define THREADS 64 +#define TAB_SIZE 1000 __global__ void copy(int *a, int *b) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < TAB_SIZE) b[tid] = a[tid]; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) b[tid] = a[tid]; } +__global__ void init(int *a, int value) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) a[tid] = value; +} + + int main(int argc, char **argv) { - int sz_in_bytes = sizeof(int) * TAB_SIZE; + int sz_in_bytes = sizeof(int) * TAB_SIZE; + + int *h_b; + int res = 0; + int *d_a, *d_b; - int *h_b; - int res = 0; - int *d_a, *d_b; + // Allocation on host (malloc) + h_b = (int *)malloc(sz_in_bytes); - // Allocation on host (malloc) - h_b = (int *)malloc(sz_in_bytes); + // Too big allocation on device + checkCudaErrors(cudaMalloc((void **)&d_a, 100000000000)); - // Too big allocation on device - checkCudaErrors(cudaMalloc((void **)&d_a, 100000000000)); + // Allocation on device (cudaMalloc) + checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes)); - // Allocation on device (cudaMalloc) - checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes)); + // Kernel configuration + dim3 dimBlock(THREADS, 1, 1); + dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); - checkCudaErrors(cudaMemset(d_a, 1, sz_in_bytes)); + init<<<dimGrid, dimBlock>>>(d_a, 1); - // Kernel configuration - dim3 dimBlock(THREADS, 1, 1); - dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); - // Kernel launch - copy<<<dimGrid, dimBlock>>>(d_a, d_b); + // Kernel launch + copy<<<dimGrid, dimBlock>>>(d_a, d_b); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); - // Retrieving data from device (cudaMemcpy) - checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost)); + // Retrieving data from device (cudaMemcpy) + checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost)); - // Freeing on device (cudaFree) - checkCudaErrors(cudaFree(d_a)); - checkCudaErrors(cudaFree(d_b)); + // Freeing on device (cudaFree) + checkCudaErrors(cudaFree(d_a)); + checkCudaErrors(cudaFree(d_b)); - // computing sum of tab element - for (int i = 0; i < TAB_SIZE; i++) res += h_b[i]; + // computing sum of tab element + for (int i = 0; i < TAB_SIZE; i++) res += h_b[i]; - // Verifying if - if (res == TAB_SIZE) { - fprintf(stderr, "TEST PASSED !\n"); - } - else - { - fprintf(stderr, "TEST FAILED !\n"); - } + // Verifying if + if (res == TAB_SIZE) { + fprintf(stderr, "TEST PASSED !\n"); + } + else + { + fprintf(stderr, "TEST FAILED !\n"); + } - free(h_b); + free(h_b); - return 0; + return 0; } diff --git a/TPs/TP0/CODE/error_checking/exemple3.exe b/TPs/TP0/CODE/error_checking/exemple3.exe deleted file mode 100755 index 247981307c9bfe2e5aa932c7eeea35b63056f71b..0000000000000000000000000000000000000000 Binary files a/TPs/TP0/CODE/error_checking/exemple3.exe and /dev/null differ diff --git a/TPs/TP0/CORRECTION/device_query/Makefile b/TPs/TP0/CORRECTION/device_query/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..5264cf81a352e7036ca6fa7565b7736350881936 --- /dev/null +++ b/TPs/TP0/CORRECTION/device_query/Makefile @@ -0,0 +1,17 @@ +CC = nvcc +INCLUDES 
= -I. + +# the build target executable: +TARGET = prog + +all: $(TARGET) + +$(TARGET): $(TARGET).o + $(CC) $(TARGET).o -o $@ + +$(TARGET).o: $(TARGET).cu + $(CC) $(INCLUDES) $(TARGET).cu -c -o $@ + +clean: + $(RM) $(TARGET) $(TARGET).o + diff --git a/TPs/TP0/CORRECTION/device_query/helper_cuda.h b/TPs/TP0/CORRECTION/device_query/helper_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..3dd446dced85de9d10eb9f1597c56099fa358bdb --- /dev/null +++ b/TPs/TP0/CORRECTION/device_query/helper_cuda.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +#ifndef MIN +#define MIN(a, b) (a < b ? 
a : b) +#endif + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +void check(cudaError_t result, char const *const func, const char *const file, + int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, (int)result, cudaGetErrorName(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, (int)(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// Beginning of GPU Architecture definitions +int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char *_ConvertSMVer2ArchName(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char *name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) + { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." 
+ " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} +// end of GPU Architecture definitions + +// end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ \ No newline at end of file diff --git a/TPs/TP0/CORRECTION/device_query/prog.cu b/TPs/TP0/CORRECTION/device_query/prog.cu new file mode 100644 index 0000000000000000000000000000000000000000..9700ec2d363038a2667245aec5f9706ded410b48 --- /dev/null +++ b/TPs/TP0/CORRECTION/device_query/prog.cu @@ -0,0 +1,115 @@ +#include <stdio.h> +#include <cuda_runtime.h> +#include <cuda.h> + +#include "helper_cuda.h" + +int main(int argc, char **argv) { + + // Get number of GPUs + int deviceCount = 0; + cudaError_t error = cudaGetDeviceCount(&deviceCount); + + if (error != cudaSuccess) { + printf("cudaGetDeviceCount error %d\n-> %s\n", error, cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + // Get device propreties and print + for (int dev = 0; dev < deviceCount; dev++) { + struct cudaDeviceProp prop; + int value; + printf("\n========== cudaDeviceGetProperties ============ \n"); + cudaGetDeviceProperties(&prop, dev); + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" GPU Clock Rate (MHz): %d\n", prop.clockRate/1000); + printf(" Memory Clock Rate (MHz): %d\n", prop.memoryClockRate/1000); + printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth); + printf(" Peak Memory Bandwidth (GB/s): %.2f\n", + 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6); + printf(" CUDA Cores/MP: %d\n", _ConvertSMVer2Cores(prop.major, prop.minor)); + printf(" CUDA Cores: %d\n", _ConvertSMVer2Cores(prop.major, prop.minor) * + prop.multiProcessorCount); + printf(" Peak TFLOPS: %.1f\n", + (_ConvertSMVer2Cores(prop.major, prop.minor) * + 2.0 * prop.multiProcessorCount * prop.clockRate * 1000.0)/ 1.0e12); + printf(" Total amount of global memory: %.0f GB\n", prop.totalGlobalMem / 1073741824.0f); + printf(" Total amount of shared memory per block: %zu kB\n", + prop.sharedMemPerBlock/1024); + printf(" Total number of registers available per block: %d\n", + prop.regsPerBlock); + printf(" Warp size: %d\n", + prop.warpSize); + printf(" Maximum number of threads per block: %d\n", + prop.maxThreadsPerBlock); + printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", + prop.maxThreadsDim[0], prop.maxThreadsDim[1], + prop.maxThreadsDim[2]); + printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", + prop.maxGridSize[0], prop.maxGridSize[1], + prop.maxGridSize[2]); + + printf("\n\n========== cudaDeviceGetAttribute ============ \n"); + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxThreadsPerBlock, dev); + printf(" Max number of threads per block: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlockDimX, dev); + printf(" Max block dimension X: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlockDimY, dev); + printf(" Max block dimension Y: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlockDimZ, dev); + printf(" Max block dimension Z: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxGridDimX, dev); + printf(" Max grid dimension X: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxGridDimY, dev); + printf(" Max grid dimension Y: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxGridDimZ, dev); + printf(" Max grid dimension Z: %d\n", + value); + cudaDeviceGetAttribute (&value, 
cudaDevAttrMaxSharedMemoryPerBlock, dev); + printf(" Max shared memory per block: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrWarpSize, dev); + printf(" Warp size: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrClockRate, dev); + printf(" Peak clock frequency in kilohertz: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMemoryClockRate, dev); + printf(" Peak memory clock frequency in kilohertz: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrGlobalMemoryBusWidth, dev); + printf(" Global memory bus width in bits: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrL2CacheSize, dev); + printf(" Size of L2 cache in bytes: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxThreadsPerMultiProcessor, dev); + printf(" Maximum resident threads per SM: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrComputeCapabilityMajor, dev); + printf(" Major compute capability version number: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrComputeCapabilityMinor, dev); + printf(" Minor compute capability version number: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev); + printf(" Max shared memory per SM in bytes: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxRegistersPerMultiprocessor, dev); + printf(" Max number of 32-bit registers per SM: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + printf(" Max per block shared mem size on the device: %d\n", + value); + cudaDeviceGetAttribute (&value, cudaDevAttrMaxBlocksPerMultiprocessor, dev); + printf(" Max thread blocks that can reside on a SM: %d\n", + value); + } +} diff --git a/TPs/TP0/CORRECTION/device_query/tags b/TPs/TP0/CORRECTION/device_query/tags new file mode 100644 index 0000000000000000000000000000000000000000..74edb7e8f6999f738763c9a1b17486ab326d5c8b --- /dev/null +++ b/TPs/TP0/CORRECTION/device_query/tags @@ -0,0 +1,18 @@ +!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/ +!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/ +!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/ +!_TAG_PROGRAM_NAME Exuberant Ctags // +!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/ +!_TAG_PROGRAM_VERSION 5.9~svn20110310 // +CC Makefile /^CC = nvcc$/;" m +COMMON_HELPER_CUDA_H_ helper_cuda.h 32;" d +INCLUDES Makefile /^INCLUDES = -I.$/;" m +MAX helper_cuda.h 42;" d +MIN helper_cuda.h 46;" d +TARGET Makefile /^TARGET = prog$/;" m +_ConvertSMVer2ArchName helper_cuda.h /^inline const char *_ConvertSMVer2ArchName(int major, int minor)$/;" f +_ConvertSMVer2Cores helper_cuda.h /^int _ConvertSMVer2Cores(int major, int minor)$/;" f +__getLastCudaError helper_cuda.h /^inline void __getLastCudaError(const char *errorMessage, const char *file,$/;" f +check helper_cuda.h /^void check(cudaError_t result, char const *const func, const char *const file,$/;" f +checkCudaErrors helper_cuda.h 49;" d +getLastCudaError helper_cuda.h 50;" d diff --git a/TPs/TP0/CORRECTION/error_checking/exemple2.cu b/TPs/TP0/CORRECTION/error_checking/exemple2.cu new file mode 100755 index 0000000000000000000000000000000000000000..783478f375efe2ff0eaaf295f1d9fe3bf9d1a595 --- /dev/null +++ b/TPs/TP0/CORRECTION/error_checking/exemple2.cu @@ -0,0 +1,72 @@ +#include <stdio.h> +#include <stdlib.h> +#include "helper_cuda.h" + +#define THREADS 64 +#define TAB_SIZE 1000 + +__global__ void copy(int *a, int *b) { + const int tid 
= blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) b[tid] = a[tid]; +} + +__global__ void init(int *a, int value) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < TAB_SIZE) a[tid] = value; +} + + +int main(int argc, char **argv) +{ + int sz_in_bytes = sizeof(int) * TAB_SIZE; + + int *h_b; + int res = 0; + int *d_a, *d_b, *d_dummy; + + // Allocation on host (malloc) + h_b = (int *)malloc(sz_in_bytes); + + // Allocation on device (cudaMalloc) + checkCudaErrors(cudaMalloc((void **)&d_a, sz_in_bytes)); + // suppose we forget this + // checkCudaErrors(cudaMalloc((void **)&d_b, sz_in_bytes)); + + // Kernel configuration + dim3 dimBlock(THREADS, 1, 1); + dim3 dimGrid(TAB_SIZE / THREADS + 1, 1, 1); + + init<<<dimGrid, dimBlock>>>(d_a, 1); + + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + + // Kernel launch + copy<<<dimGrid, dimBlock>>>(d_a, d_b); + checkCudaErrors(cudaMalloc((void **)&d_dummy, sz_in_bytes)); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + + // Retrieving data from device (cudaMemcpy) + checkCudaErrors(cudaMemcpy(h_b, d_b, sz_in_bytes, cudaMemcpyDeviceToHost)); + + // Freeing on device (cudaFree) + checkCudaErrors(cudaFree(d_a)); + checkCudaErrors(cudaFree(d_b)); + + // computing sum of tab element + for (int i = 0; i < TAB_SIZE; i++) res += h_b[i]; + + // Verifying if + if (res == TAB_SIZE) { + fprintf(stderr, "TEST PASSED !\n"); + } + else + { + fprintf(stderr, "TEST FAILED !\n"); + } + + free(h_b); + + return 0; +} diff --git a/TPs/TP0/CORRECTION/tags b/TPs/TP0/CORRECTION/tags new file mode 100644 index 0000000000000000000000000000000000000000..8de213abb5cdc780fe0f2a444355bf977dbb19d1 --- /dev/null +++ b/TPs/TP0/CORRECTION/tags @@ -0,0 +1,18 @@ +!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/ +!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/ +!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/ +!_TAG_PROGRAM_NAME Exuberant Ctags // +!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/ +!_TAG_PROGRAM_VERSION 5.9~svn20110310 // +CC device_query/Makefile /^CC = nvcc$/;" m +COMMON_HELPER_CUDA_H_ device_query/helper_cuda.h 32;" d +INCLUDES device_query/Makefile /^INCLUDES = -I.$/;" m +MAX device_query/helper_cuda.h 42;" d +MIN device_query/helper_cuda.h 46;" d +TARGET device_query/Makefile /^TARGET = prog$/;" m +_ConvertSMVer2ArchName device_query/helper_cuda.h /^inline const char *_ConvertSMVer2ArchName(int major, int minor)$/;" f +_ConvertSMVer2Cores device_query/helper_cuda.h /^int _ConvertSMVer2Cores(int major, int minor)$/;" f +__getLastCudaError device_query/helper_cuda.h /^inline void __getLastCudaError(const char *errorMessage, const char *file,$/;" f +check device_query/helper_cuda.h /^void check(cudaError_t result, char const *const func, const char *const file,$/;" f +checkCudaErrors device_query/helper_cuda.h 49;" d +getLastCudaError device_query/helper_cuda.h 50;" d diff --git a/TPs/TP0/SUJET/tp0.pdf b/TPs/TP0/SUJET/tp0.pdf index 9061738fe9e2dcb8996cd01a87506887027f81d9..b81d4ccc5c85f52bb976921c8d02fc19c164fc82 100644 Binary files a/TPs/TP0/SUJET/tp0.pdf and b/TPs/TP0/SUJET/tp0.pdf differ