diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c76ddb39a2a58768e8b27d16f7e4ca0b08cce0fe --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +dataset.zip +dataset/* + diff --git a/download_dataset.sh b/download_dataset.sh new file mode 100755 index 0000000000000000000000000000000000000000..2c1c5d1f149092831986227fc41927e9fa8042cd --- /dev/null +++ b/download_dataset.sh @@ -0,0 +1,3 @@ +wget "https://storage.googleapis.com/kaggle-data-sets/3384322/6207733/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240519%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240519T145233Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1ac8f9216a239f62f3aa19666ce2b09c188d1d34d5199cf254a3677292e1b893eb10d0e2280baf0cbfb1f21d38a2b99f55e3e080beaa4a376d07326750503e15f35e123e2efd21c2c300a82c5bc06c787528bbe5e0d6b7be5a31bc0e6fb458b9a59456233fb852c658827d1dd547ca683890de508dd88940526568357bdd28611409ed5db0e479abf7b6f98855cd942d0cebfae55d463f288640c594bce7e11cd9f460e941cec80a7713e7faa54e69e3e9c4e9e3cd87b11bc35aa74439f96f80c2d592c6a97519353ca099d62e7276bec190a99e9327aee45ab9531d86f8f6be65fb3931148dbd4342712849494a71adcfe0b4eb54051582393fe8a98ebf68bc" -c -O 'dataset.zip' +mkdir dataset +unzip dataset.zip -d dataset diff --git a/rapport.txt b/rapport.txt deleted file mode 100644 index b13e01b9400a0961a64fd593130f37407deeb98f..0000000000000000000000000000000000000000 --- a/rapport.txt +++ /dev/null @@ -1,122 +0,0 @@ -rapport : - - -Partie 1 : Hadoop - -cd ensiie/exo/opensearch -vi recup-bano-opensearch - --> en dessous du commentaire, i in {91..93} -on vérifie le contenu de bano-data : i y a bien les 3 départements demandés - - -nouveau terminal -cd ~/ensiie/exo/hadoop/ -./start-hadoop-ensiie.sh -•Attendre environ 20 secondes et le relancer une seconde fois : -./start-hadoop-ensiie.sh - - -docker run -it -v /home/tessa/ensiie/exo/opensearch/bano-data:/ensiie/tpnote namenode bash -> ça marche pas pck déjà un volume - - -cp -r ~/ensiie/exo/opensearch/bano-data ~/ensiie/exo/files pck montage déjà fait (sinon docker cp) - -docker exec -it namenode bash - -cd /data/hdfs/formation/bano-data -cf copie écran - -hdfs dfs -put /data/hdfs/formation/bano-data /ensiie/tpnote/ -cf screen -> pb bizarre avec le /, ça trouvait pas mes fichiers sur hive - - - -partie 2 : Hive - -docker exec -it hive-server bash -(une fois dans le conteneur) -/opt/hive/bin/beeline -u jdbc:hive2://hive-server:10000 - -1) -hive> -CREATE DATABASE IF NOT EXISTS ensiie; -USE ensiie; - -2) -CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.] table_name -[(col_name data_type [COMMENT col_comment], ...)] -[COMMENT table_comment] -[ROW FORMAT row_format] -[STORED AS file_format] - -cf site bano - -CREATE EXTERNAL TABLE IF NOT EXISTS bano -( id STRING, - numero STRING, - voie STRING, -code_postal INT, -nom_commune STRING, -source STRING, -latitude DOUBLE, -longitude DOUBLE) -ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' -WITH SERDEPROPERTIES ( - "separatorChar" = ",", - "quoteChar" = "\"" -) -STORED AS textfile -LOCATION '/ensiie/tpnote/bano-data'; - - -SELECT * FROM bano LIMIT 10; - -cf screen - - -3) -CREATE TABLE IF NOT EXISTS banorc -( id STRING, - numero STRING, - voie STRING, -code_postal INT, -nom_commune STRING, -source STRING, -latitude DOUBLE, -longitude DOUBLE) -STORED AS ORC; -INSERT INTO TABLE banorc -SELECT * FROM bano; - -4) -select count (*) from bano ; -select count (*) from banorc ; - -756087 -1,364s VS 0,11s - -5) -SELECT SUBSTR(id, 1, 2) AS departement, COUNT(DISTINCT nom_commune) AS nombre_de_communes -FROM bano -GROUP BY SUBSTR(id, 1, 2) -ORDER BY departement; - -select count(distinct nom_commune) from bano; - - -Spark : -cd /spark/bin -./spark-shell --driver-memory 2G - -val lines = sc.textFile("hdfs://namenode:9000/ensiie/tpnote/bano-data/bano-91.csv") -val words = lines.flatMap(_.split("\\s+")) -val wc = words.map(w => (w, 1)).reduceByKey(_ + _) -val sorted_words = wc.sortBy(-_._2).take(30) - - -3) OpenSearch Dashboard - -cd ensiie/exo/hadoop/ -./stop-hadoop-ensiie.sh -cd ../opensearch