Skip to content
Extraits de code Groupes Projets
Valider 7ae17486 rédigé par Mickaël Boichot's avatar Mickaël Boichot
Parcourir les fichiers

Ajout de la correction du TD2

parent abd2dd1f
Branches
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -5,14 +5,13 @@ CFLAGS=-O3 -Wall
EXE1=pi.exe
EXE2=pi_omp.exe
# EXE3=pi_task.exe
#EXE3=pi_cuda.exe
#EXE4=pi_cuda_shared.exe
#EXE5=pi_cuda_shared_2.exe
#EXE6=pi_cuda_shared_3.exe
#EXE7=pi_cuda_shared_4.exe
EXE3=pi_cuda.exe
EXE4=pi_cuda_shared.exe
EXE5=pi_cuda_shared_2.exe
EXE6=pi_cuda_shared_3.exe
EXE7=pi_cuda_shared_4.exe
all : $(EXE1) $(EXE2)
#$(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7)
all : $(EXE1) $(EXE2) $(EXE3) $(EXE4) $(EXE5) $(EXE6) $(EXE7)
$(EXE1) : pi_sequentiel.o
$(CC) $(CFLAGS) -o $@ $<
......@@ -20,9 +19,6 @@ $(EXE1) : pi_sequentiel.o
$(EXE2) : pi_omp.o
$(CC) $(CFLAGS_OMP) -o $@ $<
# $(EXE3) : pi_task.o
# $(CC) $(CFLAGS_OMP) -o $@ $<
$(EXE3) : pi_cuda.cu
$(CUDA_CC) -O3 -o $@ $<
......
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#include <curand_kernel.h>
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/**
 * Sample the clock through the file's gettime() macro and express the
 * reading in seconds.
 *
 * @return whole seconds plus the nanosecond part scaled to seconds, or 0
 *         when the clock read reports an error.
 */
double get_elapsedtime(void)
{
    struct timespec now;

    if (gettime(&now) != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = get_sub_seconde(now);
    return whole_seconds + fraction;
}
/* QUESTION 3 */
#define TRIALS_PER_THREAD 4096
#define BLOCKS 512
#define THREADS 256
/* FIN QUESTION 3*/
/* QUESTION 6 */
/**
 * Monte Carlo estimate of pi, one independent estimate per thread.
 *
 * Each thread draws TRIALS_PER_THREAD (x, y) pairs, counts those satisfying
 * x*x + y*y <= 1 and stores 4 * hits / TRIALS_PER_THREAD.
 *
 * @param estimate  output array indexed by the flat thread index; needs one
 *                  float per launched thread.
 * @param states    cuRAND state array indexed the same way; every thread
 *                  initialises and then advances its own entry in place.
 */
__global__ void gpu_monte_carlo(float *estimate, curandState *states) {
    // Flat index of this thread across the whole 1D launch.
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    curandState *rng = &states[tid];

    // Same seed for everyone; the thread index is passed as the sequence number.
    curand_init(2020, tid, 0, rng);

    int hits = 0;
    int remaining = TRIALS_PER_THREAD;
    while (remaining-- > 0) {
        const float px = curand_uniform(rng);
        const float py = curand_uniform(rng);
        if (px * px + py * py <= 1.0f) {
            ++hits;  // the point lies inside the quarter disc
        }
    }

    // Per-thread estimate of pi.
    estimate[tid] = 4.0f * hits / (float) TRIALS_PER_THREAD;
}
/* FIN QUESTION 6 */
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: launches gpu_monte_carlo on BLOCKS x THREADS threads, copies
 * the per-thread estimates back, averages them and prints the timing and the
 * resulting value of pi.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if a host
 *         allocation or any CUDA call fails.
 */
int main (int argc, char *argv[]) {
    const size_t n_threads = (size_t) BLOCKS * THREADS;
    double t0 = 0., t1 = 0., duration = 0.;

    (void) argc;
    (void) argv;

    /* One float per thread (512 KiB with the default sizes): keep it on the
       heap instead of the stack. calloc gives the same zero initialisation. */
    float *h_counts = (float *) calloc(n_threads, sizeof(float));
    if (h_counts == NULL) {
        fprintf(stderr, "Host allocation of %zu floats failed\n", n_threads);
        return EXIT_FAILURE;
    }

    printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);

    /* QUESTION 4 */
    float *d_counts = NULL;
    check_cuda(cudaMalloc((void **) &d_counts, n_threads * sizeof(float)),
               "cudaMalloc(d_counts)"); // device memory for the per-thread estimates
    /* END QUESTION 4 */

    /* QUESTION 6 */
    curandState *d_states = NULL;
    check_cuda(cudaMalloc((void **) &d_states, n_threads * sizeof(curandState)),
               "cudaMalloc(d_states)");
    /* END QUESTION 6 */

    t0 = get_elapsedtime();

    /* QUESTION 3 */
    gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts, d_states);
    /* A kernel launch returns nothing; configuration errors must be polled. */
    check_cuda(cudaGetLastError(), "gpu_monte_carlo launch");
    /* END QUESTION 3 */

    /* QUESTION 5 */
    /* The copy also reports errors raised while the kernel was executing. */
    check_cuda(cudaMemcpy(h_counts, d_counts, n_threads * sizeof(float), cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_counts -> h_counts)");
    /* END QUESTION 5 */

    /* Accumulate in double: adding 131072 values of about 3.14 in a float
       loses low-order digits once the running sum grows large. */
    double sum = 0.0;
    for (size_t i = 0; i < n_threads; i++)
    {
        sum += h_counts[i];
    }
    float pi_gpu = (float) (sum / (double) n_threads);

    t1 = get_elapsedtime();
    duration = (t1 - t0);

    printf("GPU pi calculated in %lf s.\n", duration);
    fprintf(stdout, "Pi ~= %lf\n", pi_gpu);

    /* Release everything that was allocated above. */
    check_cuda(cudaFree(d_states), "cudaFree(d_states)");
    check_cuda(cudaFree(d_counts), "cudaFree(d_counts)");
    free(h_counts);
    return 0;
}
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#include <curand_kernel.h>
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/**
 * Sample the clock through the file's gettime() macro and express the
 * reading in seconds.
 *
 * @return whole seconds plus the nanosecond part scaled to seconds, or 0
 *         when the clock read reports an error.
 */
double get_elapsedtime(void)
{
    struct timespec now;

    if (gettime(&now) != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = get_sub_seconde(now);
    return whole_seconds + fraction;
}
/* QUESTION 3 */
#define TRIALS_PER_THREAD 4096
#define BLOCKS 512
#define THREADS 256
/* FIN QUESTION 3*/
/* QUESTION 6 */
/**
 * Monte Carlo estimate of pi with a per-block reduction in shared memory.
 *
 * Every thread computes its own estimate from TRIALS_PER_THREAD draws, then
 * the block adds its THREADS estimates together with an interleaved
 * (modulo-based) tree reduction. Thread 0 stores the block total.
 *
 * The shared arrays are sized with THREADS, so the kernel expects to be
 * launched with THREADS threads per block.
 *
 * @param estimate  output array, one float per block (sum of that block's
 *                  per-thread estimates).
 */
__global__ void gpu_monte_carlo(float *estimate) {
    __shared__ float estimate_s[THREADS];
    __shared__ curandState states_s[THREADS];

    const unsigned int local_id = threadIdx.x;
    const unsigned int global_id = threadIdx.x + blockDim.x * blockIdx.x;
    curandState *rng = &states_s[local_id];

    // Same seed for everyone; the global thread index is the sequence number.
    curand_init(2020, global_id, 0, rng);

    int hits = 0;
    for (int trial = TRIALS_PER_THREAD; trial > 0; --trial) {
        const float px = curand_uniform(rng);
        const float py = curand_uniform(rng);
        hits += (px * px + py * py <= 1.0f) ? 1 : 0;
    }

    // Publish this thread's estimate before anyone starts reducing.
    estimate_s[local_id] = 4.0f * hits / (float) TRIALS_PER_THREAD;
    __syncthreads();

    // Interleaved addressing: at each step, threads whose index is a multiple
    // of 2*stride absorb the neighbour located stride slots further.
    unsigned int stride = 1;
    while (stride < THREADS) {
        const bool absorbs = (local_id % (2 * stride)) == 0;
        if (absorbs) {
            estimate_s[local_id] += estimate_s[local_id + stride];
        }
        __syncthreads();
        stride *= 2;
    }

    if (local_id == 0) {
        estimate[blockIdx.x] = estimate_s[0];
    }
}
/* FIN QUESTION 6 */
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: launches gpu_monte_carlo on BLOCKS x THREADS threads, copies
 * back one partial sum per block, averages over all threads and prints the
 * timing and the resulting value of pi.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA call
 *         fails.
 */
int main (int argc, char *argv[]) {
    /* The kernel writes one partial sum per block, so BLOCKS entries suffice. */
    float h_counts[BLOCKS] = { 0 };
    double t0 = 0., t1 = 0., duration = 0.;

    (void) argc;
    (void) argv;

    printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);

    /* QUESTION 4 */
    float *d_counts = NULL;
    check_cuda(cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)),
               "cudaMalloc(d_counts)"); // device memory for the per-block sums
    /* END QUESTION 4 */

    t0 = get_elapsedtime();

    /* QUESTION 3 */
    gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
    /* A kernel launch returns nothing; configuration errors must be polled. */
    check_cuda(cudaGetLastError(), "gpu_monte_carlo launch");
    /* END QUESTION 3 */

    /* QUESTION 5 */
    /* The copy also reports errors raised while the kernel was executing. */
    check_cuda(cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_counts -> h_counts)");
    /* END QUESTION 5 */

    float pi_gpu = 0.f;
    for(int i = 0; i < BLOCKS; i++)
    {
        pi_gpu += h_counts[i];
    }
    /* Each block sum already adds THREADS estimates, hence the divisor. */
    pi_gpu /= BLOCKS * THREADS;

    t1 = get_elapsedtime();
    duration = (t1 - t0);

    printf("GPU pi calculated in %lf s.\n", duration);
    fprintf(stdout, "Pi ~= %lf\n", pi_gpu);

    check_cuda(cudaFree(d_counts), "cudaFree(d_counts)");
    return 0;
}
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#include <curand_kernel.h>
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/**
 * Sample the clock through the file's gettime() macro and express the
 * reading in seconds.
 *
 * @return whole seconds plus the nanosecond part scaled to seconds, or 0
 *         when the clock read reports an error.
 */
double get_elapsedtime(void)
{
    struct timespec now;

    if (gettime(&now) != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = get_sub_seconde(now);
    return whole_seconds + fraction;
}
/* QUESTION 3 */
#define TRIALS_PER_THREAD 4096
#define BLOCKS 512
#define THREADS 256
/* FIN QUESTION 3*/
/* QUESTION 6 */
/**
 * Monte Carlo estimate of pi with a per-block reduction in shared memory
 * (interleaved addressing driven by a strided index, no modulo).
 *
 * Every thread computes its own estimate from TRIALS_PER_THREAD draws, then
 * the block adds its THREADS estimates together. Thread 0 stores the block
 * total.
 *
 * Preconditions: the kernel must be launched with THREADS threads per block
 * (the shared arrays and the reduction bounds use THREADS), and THREADS must
 * be a power of two for the tree reduction to cover every element.
 *
 * @param estimate  output array, one float per block (sum of that block's
 *                  per-thread estimates).
 */
__global__ void gpu_monte_carlo(float *estimate) {
    unsigned int gtid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int tid = threadIdx.x;
    int points_in_circle = 0;
    float x = 0.f, y = 0.f;

    __shared__ float estimate_s[THREADS];
    __shared__ curandState states_s[THREADS];

    // Same seed for everyone; the global thread index is the sequence number.
    curand_init(2020, gtid, 0, &states_s[tid]);

    for(int i = 0; i < TRIALS_PER_THREAD; i++)
    {
        x = curand_uniform (&states_s[tid]);
        y = curand_uniform (&states_s[tid]);
        points_in_circle += (x*x + y*y <= 1.0f); // count the point if it lies in the circle
    }

    estimate_s[tid] = 4.0f * points_in_circle / (float) TRIALS_PER_THREAD; // per-thread estimate of pi
    __syncthreads();

    for (unsigned int s = 1; s < THREADS; s *= 2) {
        // Thread tid owns slot 2*s*tid at this step. The accumulation must
        // address that slot (not slot tid): otherwise neighbouring threads
        // read and write the same elements concurrently and the sum is wrong.
        unsigned int index = 2 * s * tid;
        if (index < THREADS)
            estimate_s[index] += estimate_s[index + s];
        __syncthreads();
    }

    if (tid == 0)
        estimate[blockIdx.x] = estimate_s[0];
}
/* FIN QUESTION 6 */
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: launches gpu_monte_carlo on BLOCKS x THREADS threads, copies
 * back one partial sum per block, averages over all threads and prints the
 * timing and the resulting value of pi.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA call
 *         fails.
 */
int main (int argc, char *argv[]) {
    /* The kernel writes one partial sum per block, so BLOCKS entries suffice. */
    float h_counts[BLOCKS] = { 0 };
    double t0 = 0., t1 = 0., duration = 0.;

    (void) argc;
    (void) argv;

    printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);

    /* QUESTION 4 */
    float *d_counts = NULL;
    check_cuda(cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)),
               "cudaMalloc(d_counts)"); // device memory for the per-block sums
    /* END QUESTION 4 */

    t0 = get_elapsedtime();

    /* QUESTION 3 */
    gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
    /* A kernel launch returns nothing; configuration errors must be polled. */
    check_cuda(cudaGetLastError(), "gpu_monte_carlo launch");
    /* END QUESTION 3 */

    /* QUESTION 5 */
    /* The copy also reports errors raised while the kernel was executing. */
    check_cuda(cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_counts -> h_counts)");
    /* END QUESTION 5 */

    float pi_gpu = 0.f;
    for(int i = 0; i < BLOCKS; i++)
    {
        pi_gpu += h_counts[i];
    }
    /* Each block sum already adds THREADS estimates, hence the divisor. */
    pi_gpu /= BLOCKS * THREADS;

    t1 = get_elapsedtime();
    duration = (t1 - t0);

    printf("GPU pi calculated in %lf s.\n", duration);
    fprintf(stdout, "Pi ~= %lf\n", pi_gpu);

    check_cuda(cudaFree(d_counts), "cudaFree(d_counts)");
    return 0;
}
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#include <curand_kernel.h>
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/**
 * Sample the clock through the file's gettime() macro and express the
 * reading in seconds.
 *
 * @return whole seconds plus the nanosecond part scaled to seconds, or 0
 *         when the clock read reports an error.
 */
double get_elapsedtime(void)
{
    struct timespec now;

    if (gettime(&now) != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = get_sub_seconde(now);
    return whole_seconds + fraction;
}
/* QUESTION 3 */
#define TRIALS_PER_THREAD 4096
#define BLOCKS 512
#define THREADS 256
/* FIN QUESTION 3*/
/* QUESTION 6 */
/**
 * Monte Carlo estimate of pi with a per-block reduction in shared memory
 * (sequential addressing: the active range is halved at every step).
 *
 * Every thread computes its own estimate from TRIALS_PER_THREAD draws, then
 * the block adds its THREADS estimates together. Thread 0 stores the block
 * total.
 *
 * The shared arrays are sized with THREADS, so the kernel expects to be
 * launched with THREADS threads per block.
 *
 * @param estimate  output array, one float per block (sum of that block's
 *                  per-thread estimates).
 */
__global__ void gpu_monte_carlo(float *estimate) {
    __shared__ float estimate_s[THREADS];
    __shared__ curandState states_s[THREADS];

    const unsigned int local_id = threadIdx.x;
    const unsigned int global_id = threadIdx.x + blockDim.x * blockIdx.x;
    curandState *rng = &states_s[local_id];

    // Same seed for everyone; the global thread index is the sequence number.
    curand_init(2020, global_id, 0, rng);

    int hits = 0;
    for (int trial = TRIALS_PER_THREAD; trial > 0; --trial) {
        const float px = curand_uniform(rng);
        const float py = curand_uniform(rng);
        hits += (px * px + py * py <= 1.0f) ? 1 : 0;
    }

    // Publish this thread's estimate before anyone starts reducing.
    estimate_s[local_id] = 4.0f * hits / (float) TRIALS_PER_THREAD;
    __syncthreads();

    // The lower `width` threads each absorb the element `width` slots above.
    unsigned int width = THREADS / 2;
    while (width > 0) {
        if (local_id < width) {
            estimate_s[local_id] += estimate_s[local_id + width];
        }
        __syncthreads();
        width >>= 1;
    }

    if (local_id == 0) {
        estimate[blockIdx.x] = estimate_s[0];
    }
}
/* FIN QUESTION 6 */
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: launches gpu_monte_carlo on BLOCKS x THREADS threads, copies
 * back one partial sum per block, averages over all threads and prints the
 * timing and the resulting value of pi.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA call
 *         fails.
 */
int main (int argc, char *argv[]) {
    /* The kernel writes one partial sum per block, so BLOCKS entries suffice. */
    float h_counts[BLOCKS] = { 0 };
    double t0 = 0., t1 = 0., duration = 0.;

    (void) argc;
    (void) argv;

    printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);

    /* QUESTION 4 */
    float *d_counts = NULL;
    check_cuda(cudaMalloc((void **) &d_counts, BLOCKS * sizeof(float)),
               "cudaMalloc(d_counts)"); // device memory for the per-block sums
    /* END QUESTION 4 */

    t0 = get_elapsedtime();

    /* QUESTION 3 */
    gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
    /* A kernel launch returns nothing; configuration errors must be polled. */
    check_cuda(cudaGetLastError(), "gpu_monte_carlo launch");
    /* END QUESTION 3 */

    /* QUESTION 5 */
    /* The copy also reports errors raised while the kernel was executing. */
    check_cuda(cudaMemcpy(h_counts, d_counts, BLOCKS * sizeof(float), cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_counts -> h_counts)");
    /* END QUESTION 5 */

    float pi_gpu = 0.f;
    for(int i = 0; i < BLOCKS; i++)
    {
        pi_gpu += h_counts[i];
    }
    /* Each block sum already adds THREADS estimates, hence the divisor. */
    pi_gpu /= BLOCKS * THREADS;

    t1 = get_elapsedtime();
    duration = (t1 - t0);

    printf("GPU pi calculated in %lf s.\n", duration);
    fprintf(stdout, "Pi ~= %lf\n", pi_gpu);

    check_cuda(cudaFree(d_counts), "cudaFree(d_counts)");
    return 0;
}
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#include <curand_kernel.h>
#define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
#define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
/**
 * Sample the clock through the file's gettime() macro and express the
 * reading in seconds.
 *
 * @return whole seconds plus the nanosecond part scaled to seconds, or 0
 *         when the clock read reports an error.
 */
double get_elapsedtime(void)
{
    struct timespec now;

    if (gettime(&now) != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = get_sub_seconde(now);
    return whole_seconds + fraction;
}
/* QUESTION 3 */
#define TRIALS_PER_THREAD 4096
#define BLOCKS 512
#define THREADS 256
/* FIN QUESTION 3*/
/* QUESTION 6 */
/**
 * Monte Carlo estimate of pi with a per-block shared-memory reduction
 * followed by one atomic accumulation per block.
 *
 * Every thread computes its own estimate from TRIALS_PER_THREAD draws, the
 * block adds its THREADS estimates together (sequential addressing), and
 * thread 0 atomically adds the block total to *estimate.
 *
 * The shared arrays are sized with THREADS, so the kernel expects to be
 * launched with THREADS threads per block. The kernel only adds to
 * *estimate; it never resets it.
 *
 * @param estimate  pointer to a single float accumulator shared by all
 *                  blocks.
 */
__global__ void gpu_monte_carlo(float *estimate) {
    __shared__ float estimate_s[THREADS];
    __shared__ curandState states_s[THREADS];

    const unsigned int local_id = threadIdx.x;
    const unsigned int global_id = threadIdx.x + blockDim.x * blockIdx.x;
    curandState *rng = &states_s[local_id];

    // Same seed for everyone; the global thread index is the sequence number.
    curand_init(2020, global_id, 0, rng);

    int hits = 0;
    for (int trial = TRIALS_PER_THREAD; trial > 0; --trial) {
        const float px = curand_uniform(rng);
        const float py = curand_uniform(rng);
        hits += (px * px + py * py <= 1.0f) ? 1 : 0;
    }

    // Publish this thread's estimate before anyone starts reducing.
    estimate_s[local_id] = 4.0f * hits / (float) TRIALS_PER_THREAD;
    __syncthreads();

    // The lower `width` threads each absorb the element `width` slots above.
    unsigned int width = THREADS / 2;
    while (width > 0) {
        if (local_id < width) {
            estimate_s[local_id] += estimate_s[local_id + width];
        }
        __syncthreads();
        width >>= 1;
    }

    // One atomic per block folds the block total into the global accumulator.
    if (local_id == 0) {
        atomicAdd(estimate, estimate_s[0]);
    }
}
/* FIN QUESTION 6 */
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: launches gpu_monte_carlo on BLOCKS x THREADS threads, reads
 * back the single accumulated sum, averages over all threads and prints the
 * timing and the resulting value of pi.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if any CUDA call
 *         fails.
 */
int main (int argc, char *argv[]) {
    float h_counts = 0;
    double t0 = 0., t1 = 0., duration = 0.;

    (void) argc;
    (void) argv;

    printf("# of trials per thread = %d, # of blocks = %d, # of threads/block = %d.\n", TRIALS_PER_THREAD, BLOCKS, THREADS);

    /* QUESTION 4 */
    float *d_counts = NULL;
    check_cuda(cudaMalloc((void **) &d_counts, sizeof(float)),
               "cudaMalloc(d_counts)"); // device memory for the accumulator
    /* cudaMalloc does not clear memory and the kernel only ever adds to the
       accumulator, so it has to start from zero (all-zero bytes == 0.0f). */
    check_cuda(cudaMemset(d_counts, 0, sizeof(float)), "cudaMemset(d_counts)");
    /* END QUESTION 4 */

    t0 = get_elapsedtime();

    /* QUESTION 3 */
    gpu_monte_carlo<<<BLOCKS, THREADS>>>(d_counts);
    /* A kernel launch returns nothing; configuration errors must be polled. */
    check_cuda(cudaGetLastError(), "gpu_monte_carlo launch");
    /* END QUESTION 3 */

    /* QUESTION 5 */
    /* The copy also reports errors raised while the kernel was executing. */
    check_cuda(cudaMemcpy(&h_counts, d_counts, sizeof(float), cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_counts -> h_counts)");
    /* END QUESTION 5 */

    /* The accumulator holds the sum of every thread's estimate. */
    float pi_gpu = h_counts / (BLOCKS * THREADS);

    t1 = get_elapsedtime();
    duration = (t1 - t0);

    printf("GPU pi calculated in %lf s.\n", duration);
    fprintf(stdout, "Pi ~= %lf\n", pi_gpu);

    check_cuda(cudaFree(d_counts), "cudaFree(d_counts)");
    return 0;
}
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous connecter pour commenter