Commit bcde70ef, authored 1 year ago by Nicolas MARIE

TP2

parent 08448aef

No related branches, tags, or merge requests found.

Showing 3 changed files, with 323 additions and 50 deletions:

TPs/TP2/CODE/Makefile       +18  −8
TPs/TP2/CODE/helper_cuda.h  +183 −0
TPs/TP2/CODE/tp2.cu         +122 −42
TPs/TP2/CODE/Makefile  +18 −8

 CC = gcc
-CFLAGS = -O3 -Wall
+NVCC = nvcc
+CFLAGS = -O3 -Wall -Wextra
+NVFLAGS = -O3 -I. -Xcompiler="-Wall -Wextra -Werror -O3"
 LDFLAGS = -lm

-EXE = tp2.exe
+TARGET = tp2cpu
+TARGETGPU = tp2gpu

-all: $(EXE)
+all: $(TARGET) $(TARGETGPU)

-$(EXE): tp2.o
+$(TARGET): tp2_cpu.o
 	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)

-%.o: %.c
+$(TARGETGPU): tp2_gpu.o
+	$(NVCC) $(NVFLAGS) -o $@ $< $(LDFLAGS)
+
+%_cpu.o: %.c
 	$(CC) $(CFLAGS) -c -o $@ $<

-run: $(EXE)
-	./$< 10240
+%_gpu.o: %.cu
+	$(NVCC) $(NVFLAGS) -c -o $@ $<
+
+run: $(TARGET) $(TARGETGPU)
+	./$(TARGET) 10240
+	./$(TARGETGPU) 10240

 proper:
 	rm -f *.o

 clean:
-	rm -f *.o $(EXE)
+	rm -f *.o $(TARGET) $(TARGETGPU)
TPs/TP2/CODE/helper_cuda.h  0 → 100644  +183 −0

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking

#ifndef COMMON_HELPER_CUDA_H_
#define COMMON_HELPER_CUDA_H_

#pragma once

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif
#ifndef MIN
#define MIN(a, b) (a < b ? a : b)
#endif

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)

void check(cudaError_t result, char const *const func,
           const char *const file, int const line)
{
  if (result)
  {
    fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\"\n",
            file, line, (int)result, cudaGetErrorName(result), func);
    exit(EXIT_FAILURE);
  }
}

inline void __getLastCudaError(const char *errorMessage, const char *file,
                               const int line)
{
  cudaError_t err = cudaGetLastError();

  if (cudaSuccess != err)
  {
    fprintf(stderr,
            "%s(%i) : getLastCudaError() CUDA error :"
            " %s : (%d) %s.\n",
            file, line, errorMessage, (int)(err), cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Beginning of GPU Architecture definitions
int _ConvertSMVer2Cores(int major, int minor)
{
  // Defines for GPU Architecture types (using the SM version to determine
  // the # of cores per SM
  typedef struct
  {
    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
             // and m = SM minor version
    int Cores;
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192},
      {0x50, 128}, {0x52, 128}, {0x53, 128},
      {0x60, 64},  {0x61, 128}, {0x62, 128},
      {0x70, 64},  {0x72, 64},  {0x75, 64},
      {0x80, 64},  {0x86, 128}, {0x87, 128},
      {-1, -1}};

  int index = 0;

  while (nGpuArchCoresPerSM[index].SM != -1)
  {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
    {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }

  // If we don't find the values, we default use the previous one
  // to run properly
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      " Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}

inline const char *_ConvertSMVer2ArchName(int major, int minor)
{
  // Defines for GPU Architecture types (using the SM version to determine
  // the GPU Arch name)
  typedef struct
  {
    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
             // and m = SM minor version
    const char *name;
  } sSMtoArchName;

  sSMtoArchName nGpuArchNameSM[] = {
      {0x30, "Kepler"},  {0x32, "Kepler"},  {0x35, "Kepler"},
      {0x37, "Kepler"},  {0x50, "Maxwell"}, {0x52, "Maxwell"},
      {0x53, "Maxwell"}, {0x60, "Pascal"},  {0x61, "Pascal"},
      {0x62, "Pascal"},  {0x70, "Volta"},   {0x72, "Xavier"},
      {0x75, "Turing"},  {0x80, "Ampere"},  {0x86, "Ampere"},
      {-1, "Graphics Device"}};

  int index = 0;

  while (nGpuArchNameSM[index].SM != -1)
  {
    if (nGpuArchNameSM[index].SM == ((major << 4) + minor))
    {
      return nGpuArchNameSM[index].name;
    }
    index++;
  }

  // If we don't find the values, we default use the previous one
  // to run properly
  printf(
      "MapSMtoArchName for SM %d.%d is undefined."
      " Default to use %s\n",
      major, minor, nGpuArchNameSM[index - 1].name);
  return nGpuArchNameSM[index - 1].name;
}
// end of GPU Architecture definitions

// end of CUDA Helper Functions

#endif  // COMMON_HELPER_CUDA_H_
\ No newline at end of file
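For context, a minimal usage sketch (not part of the commit) of how the macros and SM-version lookup helpers defined above are typically called from a .cu file; the dummy kernel and device index 0 are illustrative assumptions:

#include <stdio.h>
#include "helper_cuda.h"

// Illustrative kernel, not from the repository.
__global__ void dummy(float *p) { p[threadIdx.x] = 1.0f; }

int main(void)
{
  // Report the detected architecture via the SM-version lookup tables.
  cudaDeviceProp prop;
  checkCudaErrors(cudaGetDeviceProperties(&prop, 0));
  printf("%s: SM %d.%d (%s), %d cores/SM\n", prop.name, prop.major, prop.minor,
         _ConvertSMVer2ArchName(prop.major, prop.minor),
         _ConvertSMVer2Cores(prop.major, prop.minor));

  float *d = NULL;
  // checkCudaErrors() aborts with file/line and the CUDA error name on failure.
  checkCudaErrors(cudaMalloc((void **)&d, 32 * sizeof(float)));
  dummy<<<1, 32>>>(d);
  // Launch errors are reported lazily; getLastCudaError() polls
  // cudaGetLastError() and aborts with the given message if one occurred.
  getLastCudaError("dummy kernel launch failed");
  checkCudaErrors(cudaFree(d));
  return 0;
}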
TPs/TP2/CODE/tp2.cu  +122 −42

 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <inttypes.h>
+#include "helper_cuda.h"

 #define gettime(t) clock_gettime(CLOCK_MONOTONIC_RAW, t)
 #define get_sub_seconde(t) (1e-9*(double)t.tv_nsec)
+#define NSTREAMS 4

 double get_elapsedtime(void)
 {
   struct timespec st;
   int err = gettime(&st);
   if (err != 0)
     return 0;
   return (double)st.tv_sec + get_sub_seconde(st);
 }

 void verif(float *out, int sz)
 {
   float err = 0.;
   for (int i = 0; i < sz; i++)
   {
     err += abs(out[i] - exp(-abs(sin(i * 1.0))));
   }
   if (err / sz < 1.e-4)
   {
-    fprintf(stdout, "TEST PASSED (error %3.f < 1.e-4)\n", err / sz);
+    fprintf(stdout, "TEST PASSED (error %3.5f < 1.e-4)\n", err / sz);
   }
   else
   {
-    fprintf(stderr, "TEST FAILED (error %3.f > 1.e-4)\n", err / sz);
+    fprintf(stderr, "TEST FAILED (error %3.5f > 1.e-4)\n", err / sz);
   }
 }

 void func(float *out, int size)
 {
   for (int i = 0; i < size; ++i)
   {
     out[i] = exp(-abs(out[i]));
   }
 }

+__global__ void funck(float *out, int size)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= size)
+    return;
+  out[index] = exp(-abs(out[index]));
+}

 int main(int argc, char **argv)
 {
   int size = 1024;
-  if (argc == 2)
+  if (argc >= 2)
   {
     size = atoi(argv[1]);
   }
+  int nstreams = NSTREAMS;
+  if (argc >= 3)
+  {
+    nstreams = atoi(argv[2]);
+  }
+  int subsize = size / nstreams;
+
+  cudaStream_t streams[nstreams];
+  for (int i = 0; i < nstreams; ++i)
+  {
+    checkCudaErrors(cudaStreamCreate(&streams[i]));
+  }
+
+  float *tab_h = NULL;
+  float *tab_d = NULL;
+  checkCudaErrors(cudaMallocHost((void **)&tab_h, sizeof(float) * size));
+  checkCudaErrors(cudaMalloc((void **)&tab_d, sizeof(float) * size));
+  for (int i = 0; i < size; ++i)
+  {
+    tab_h[i] = sin(i * 1.);
+  }
+  //func(tab_h, size);
+  //verif(tab_h, size);
+
+  int blocksize = 1024;
+  dim3 dimBlock(blocksize, 1, 1);
+  dim3 dimGrid((size / blocksize / nstreams) + 1, 1, 1);
+
+  double t0, t1;
+  cudaEvent_t ce[nstreams * 2];
+  for (int i = 0; i < nstreams * 2; ++i)
+  {
+    cudaEventCreate(&ce[i]);
+  }
+
+  t0 = get_elapsedtime();
+  size *= NSTREAMS;
+  for (int i = 0; i < nstreams; ++i)
+  {
+    checkCudaErrors(cudaMemcpyAsync(tab_d + (subsize * i), tab_h + (subsize * i),
+                                    sizeof(float) * subsize,
+                                    cudaMemcpyHostToDevice, streams[i]));
+    cudaEventRecord(ce[i * 2]);
+    funck<<<dimGrid, dimBlock, 0, streams[i]>>>(tab_d + (subsize * i), subsize);
+    cudaEventRecord(ce[i * 2 + 1]);
+    checkCudaErrors(cudaMemcpyAsync(tab_h + (subsize * i), tab_d + (subsize * i),
+                                    sizeof(float) * subsize,
+                                    cudaMemcpyDeviceToHost, streams[i]));
+  }

   float *tab = NULL;
   tab = (float *)malloc(sizeof(float) * size);
+  cudaDeviceSynchronize();
+  getLastCudaError("funck kernel failed");
-  float d_tab;
-  cudaMalloc(
+  t1 = get_elapsedtime();

   if (tab == NULL)
   {
     fprintf(stderr, "Bad allocation\n");
     return -1;
   }

+  fprintf(stdout, "Total Time (%d): %lf ms\n", nstreams, (t1 - t0) * 1000);
+
+  float cudaDuration, cudaTotalDuration = 0;
+  for (int i = 0; i < nstreams; ++i)
+  {
+    cudaEventElapsedTime(&cudaDuration, ce[i * 2], ce[i * 2 + 1]);
+    cudaTotalDuration += cudaDuration;
+    fprintf(stdout, "Time of stream %d: %lf ms\n", i, cudaDuration);
+  }
+  fprintf(stdout, "Total Time of streams: %lf ms\n", cudaTotalDuration);

   for (int i = 0; i < size; ++i)
   {
     tab[i] = sin(i * 1.);
   }
+  verif(tab_h, size);
   func(tab, size);
+  checkCudaErrors(cudaFree(tab_d));
+  cudaFreeHost(tab_h);
   verif(tab, size);
+  for (int i = 0; i < nstreams; ++i)
+  {
+    checkCudaErrors(cudaStreamDestroy(streams[i]));
+  }
   free(tab);
   return 0;
 }
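For reference, a self-contained sketch (not from the commit; the scale kernel and problem size are illustrative) of timing one stream's work with CUDA events. cudaEventRecord accepts the stream as an optional second argument; the loop in tp2.cu records its events without one, so those markers land in the default stream rather than in streams[i].

#include <stdio.h>

// Illustrative kernel: doubles each element in place.
__global__ void scale(float *p, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    p[i] *= 2.0f;
}

int main(void)
{
  const int n = 1 << 20;
  float *d = NULL;
  cudaMalloc((void **)&d, n * sizeof(float));  // contents left uninitialized;
                                               // only timing is demonstrated
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, stream);              // marker enqueued in `stream`
  scale<<<(n + 255) / 256, 256, 0, stream>>>(d, n);
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);                  // block until `stop` completes

  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);      // milliseconds between events
  printf("kernel time: %f ms\n", ms);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
  cudaFree(d);
  return 0;
}

Recording both events in the measured stream keeps the measurement local to that stream's command queue, so it brackets only the kernel enqueued between them.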