diff --git a/hadoop/.env b/hadoop/.env new file mode 100644 index 0000000000000000000000000000000000000000..192c1afc92f7778a76f2a1d3cc349d99b907d029 --- /dev/null +++ b/hadoop/.env @@ -0,0 +1,5 @@ +CLUSTER_NAME=ensiie +ADMIN_NAME=centos +ADMIN_PASSWORD=ensiie +INSTALL_PYTHON=true # whether you want python or not (to run hadoop streaming) +INSTALL_SQOOP=false diff --git a/hadoop/docker-compose-ensiie-v3.yml b/hadoop/docker-compose-ensiie-v3.yml new file mode 100644 index 0000000000000000000000000000000000000000..3620de8a121005ae4612763313c2092e73977ce6 --- /dev/null +++ b/hadoop/docker-compose-ensiie-v3.yml @@ -0,0 +1,196 @@ +services: + namenode: + image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + networks: + - hadoop + container_name: namenode + volumes: + - hadoop_namenode:/hadoop/dfs/name + - ~/ensiie/exo/files:/data/hdfs/formation + - ~/ensiie/exo/opensearch/bano-data:/data/hdfs/bano-data + environment: + - CLUSTER_NAME=test + env_file: + - ./hadoop.env + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + ports: + - 9870:9870 + - 9000:9000 + + datanode1: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode1 + networks: + - hadoop + volumes: + - hadoop_datanode_1:/hadoop/dfs/data + env_file: + - ./hadoop.env + environment: + SERVICE_PRECONDITION: "namenode:9870" + deploy: + mode: global + datanode2: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode2 + networks: + - hadoop + volumes: + - hadoop_datanode_2:/hadoop/dfs/data + env_file: + - ./hadoop.env + environment: + SERVICE_PRECONDITION: "namenode:9870" + deploy: + mode: global + + + resourcemanager: + image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 + container_name: resourcemanager + networks: + - hadoop + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 datanode2:9864" + env_file: + - ./hadoop.env + volumes: + - 
~/ensiie/exo/files:/data/yarn/formation + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + healthcheck: + disable: true + nodemanager1: + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + container_name: nodemanager1 + networks: + - hadoop + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 resourcemanager:8088" + env_file: + - ./hadoop.env + deploy: + mode: global + nodemanager2: + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + container_name: nodemanager2 + networks: + - hadoop + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode2:9864 resourcemanager:8088" + env_file: + - ./hadoop.env + deploy: + mode: global + historyserver: + image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8 + container_name: historyserver + networks: + - hadoop + volumes: + - hadoop_historyserver:/hadoop/yarn/timeline + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 datanode2:9864 resourcemanager:8088" + env_file: + - ./hadoop.env + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + ports: + - 28188:8188 + + hive-server: + image: bde2020/hive:2.3.2-postgresql-metastore + container_name: hive-server + env_file: + - ./hadoop-hive.env + volumes: + - ~/ensiie/exo/files:/data/hive/formation + environment: + HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore" + SERVICE_PRECONDITION: "hive-metastore:9083" + ports: + - "10000:10000" + networks: + - hadoop + + hive-metastore: + image: bde2020/hive:2.3.2-postgresql-metastore + container_name: hive-metastore + env_file: + - ./hadoop-hive.env + command: /opt/hive/bin/hive --service metastore + environment: + SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 hive-metastore-postgresql:5432" + ports: + - "9083:9083" + networks: + - hadoop + + hive-metastore-postgresql: + 
image: bde2020/hive-metastore-postgresql:2.3.0 + container_name: hive-metastore-postgresql + env_file: + - ./hadoop-hive.env + networks: + - hadoop + ports: + - "5432:5432" + + spark-master: + image: bde2020/spark-master:3.1.1-hadoop3.2 + container_name: spark-master + volumes: + - ~/ensiie/exo/files:/data/spark/files + - ~/ensiie/exo/python:/data/spark/python + ports: + - "28083:8080" + - "7077:7077" + environment: + - INIT_DAEMON_STEP=setup_spark + networks: + - hadoop + spark-worker-1: + image: bde2020/spark-worker:3.1.1-hadoop3.2 + container_name: spark-worker-1 + depends_on: + - spark-master + ports: + - "28081:8081" + environment: + - "SPARK_MASTER=spark://spark-master:7077" + networks: + - hadoop + spark-worker-2: + image: bde2020/spark-worker:3.1.1-hadoop3.2 + container_name: spark-worker-2 + depends_on: + - spark-master + ports: + - "28082:8081" + environment: + - "SPARK_MASTER=spark://spark-master:7077" + networks: + - hadoop + +volumes: + hadoop_datanode_1: + hadoop_datanode_2: + hadoop_namenode: + hadoop_historyserver: + +networks: + hadoop: + diff --git a/hadoop/docker-compose-que-hadoop.yml b/hadoop/docker-compose-que-hadoop.yml new file mode 100644 index 0000000000000000000000000000000000000000..7c2e71e0da0d7b86c47f664c14d68c4c6510b07e --- /dev/null +++ b/hadoop/docker-compose-que-hadoop.yml @@ -0,0 +1,61 @@ +version: "3" + +services: + namenode: + image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + networks: + - hadoop + container_name: namenode + volumes: + - hadoop_namenode:/hadoop/dfs/name + - ~/ensiie/exo/files:/data/hdfs/formation + environment: + - CLUSTER_NAME=test + env_file: + - ./hadoop.env + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + ports: + - 9870:9870 + - 9000:9000 + + datanode1: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode1 + networks: + - hadoop + volumes: + - hadoop_datanode_1:/hadoop/dfs/data + env_file: + - 
./hadoop.env + environment: + SERVICE_PRECONDITION: "namenode:9870" + deploy: + mode: global + datanode2: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode2 + networks: + - hadoop + volumes: + - hadoop_datanode_2:/hadoop/dfs/data + env_file: + - ./hadoop.env + environment: + SERVICE_PRECONDITION: "namenode:9870" + deploy: + mode: global + +volumes: + hadoop_datanode_1: + hadoop_datanode_2: + hadoop_namenode: + +networks: + hadoop: + + diff --git a/hadoop/docker-compose.yml b/hadoop/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..3620de8a121005ae4612763313c2092e73977ce6 --- /dev/null +++ b/hadoop/docker-compose.yml @@ -0,0 +1,196 @@ +services: + namenode: + image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + networks: + - hadoop + container_name: namenode + volumes: + - hadoop_namenode:/hadoop/dfs/name + - ~/ensiie/exo/files:/data/hdfs/formation + - ~/ensiie/exo/opensearch/bano-data:/data/hdfs/bano-data + environment: + - CLUSTER_NAME=test + env_file: + - ./hadoop.env + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + ports: + - 9870:9870 + - 9000:9000 + + datanode1: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode1 + networks: + - hadoop + volumes: + - hadoop_datanode_1:/hadoop/dfs/data + env_file: + - ./hadoop.env + environment: + SERVICE_PRECONDITION: "namenode:9870" + deploy: + mode: global + datanode2: + image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + container_name: datanode2 + networks: + - hadoop + volumes: + - hadoop_datanode_2:/hadoop/dfs/data + env_file: + - ./hadoop.env + environment: + SERVICE_PRECONDITION: "namenode:9870" + deploy: + mode: global + + + resourcemanager: + image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 + container_name: resourcemanager + networks: + - hadoop + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 
datanode2:9864" + env_file: + - ./hadoop.env + volumes: + - ~/ensiie/exo/files:/data/yarn/formation + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + healthcheck: + disable: true + nodemanager1: + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + container_name: nodemanager1 + networks: + - hadoop + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 resourcemanager:8088" + env_file: + - ./hadoop.env + deploy: + mode: global + nodemanager2: + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + container_name: nodemanager2 + networks: + - hadoop + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode2:9864 resourcemanager:8088" + env_file: + - ./hadoop.env + deploy: + mode: global + historyserver: + image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8 + container_name: historyserver + networks: + - hadoop + volumes: + - hadoop_historyserver:/hadoop/yarn/timeline + environment: + SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 datanode2:9864 resourcemanager:8088" + env_file: + - ./hadoop.env + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.hostname == akswnc4.aksw.uni-leipzig.de + ports: + - 28188:8188 + + hive-server: + image: bde2020/hive:2.3.2-postgresql-metastore + container_name: hive-server + env_file: + - ./hadoop-hive.env + volumes: + - ~/ensiie/exo/files:/data/hive/formation + environment: + HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore" + SERVICE_PRECONDITION: "hive-metastore:9083" + ports: + - "10000:10000" + networks: + - hadoop + + hive-metastore: + image: bde2020/hive:2.3.2-postgresql-metastore + container_name: hive-metastore + env_file: + - ./hadoop-hive.env + command: /opt/hive/bin/hive --service metastore + environment: + SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode1:9864 datanode2:9864 hive-metastore-postgresql:5432" + ports: + - 
"9083:9083" + networks: + - hadoop + + hive-metastore-postgresql: + image: bde2020/hive-metastore-postgresql:2.3.0 + container_name: hive-metastore-postgresql + env_file: + - ./hadoop-hive.env + networks: + - hadoop + ports: + - "5432:5432" + + spark-master: + image: bde2020/spark-master:3.1.1-hadoop3.2 + container_name: spark-master + volumes: + - ~/ensiie/exo/files:/data/spark/files + - ~/ensiie/exo/python:/data/spark/python + ports: + - "28083:8080" + - "7077:7077" + environment: + - INIT_DAEMON_STEP=setup_spark + networks: + - hadoop + spark-worker-1: + image: bde2020/spark-worker:3.1.1-hadoop3.2 + container_name: spark-worker-1 + depends_on: + - spark-master + ports: + - "28081:8081" + environment: + - "SPARK_MASTER=spark://spark-master:7077" + networks: + - hadoop + spark-worker-2: + image: bde2020/spark-worker:3.1.1-hadoop3.2 + container_name: spark-worker-2 + depends_on: + - spark-master + ports: + - "28082:8081" + environment: + - "SPARK_MASTER=spark://spark-master:7077" + networks: + - hadoop + +volumes: + hadoop_datanode_1: + hadoop_datanode_2: + hadoop_namenode: + hadoop_historyserver: + +networks: + hadoop: + diff --git a/hadoop/hadoop-16oRAM.env b/hadoop/hadoop-16oRAM.env new file mode 100644 index 0000000000000000000000000000000000000000..2aba2e7a3a8d83d766f412be5aaedda1e746cc1b --- /dev/null +++ b/hadoop/hadoop-16oRAM.env @@ -0,0 +1,43 @@ +CORE_CONF_fs_defaultFS=hdfs://namenode:9000 +CORE_CONF_hadoop_http_staticuser_user=root +CORE_CONF_hadoop_proxyuser_hue_hosts=* +CORE_CONF_hadoop_proxyuser_hue_groups=* +CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec + +HDFS_CONF_dfs_webhdfs_enabled=true +HDFS_CONF_dfs_permissions_enabled=false +HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false + +YARN_CONF_yarn_log___aggregation___enable=true +YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ +YARN_CONF_yarn_resourcemanager_recovery_enabled=true 
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore +YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler +YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=4096 +YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=2 +YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate +YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true +YARN_CONF_yarn_resourcemanager_hostname=resourcemanager +YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 +YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 +YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 +YARN_CONF_yarn_timeline___service_enabled=true +YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true +YARN_CONF_yarn_timeline___service_hostname=historyserver +YARN_CONF_mapreduce_map_output_compress=true +YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec +YARN_CONF_yarn_nodemanager_resource_memory___mb=2048 +YARN_CONF_yarn_nodemanager_resource_cpu___vcores=2 +YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 +YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs +YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle + +MAPRED_CONF_mapreduce_framework_name=yarn +MAPRED_CONF_mapred_child_java_opts=-Xmx2048m +MAPRED_CONF_mapreduce_map_memory_mb=2048 +MAPRED_CONF_mapreduce_reduce_memory_mb=1536 +MAPRED_CONF_mapreduce_map_java_opts=-Xmx1536m +MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx1024m +MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ diff --git 
a/hadoop/hadoop-8goRAM.env b/hadoop/hadoop-8goRAM.env new file mode 100644 index 0000000000000000000000000000000000000000..d1fcc2164340f21b22448aa58a05bdaa9dd5c2af --- /dev/null +++ b/hadoop/hadoop-8goRAM.env @@ -0,0 +1,43 @@ +CORE_CONF_fs_defaultFS=hdfs://namenode:9000 +CORE_CONF_hadoop_http_staticuser_user=root +CORE_CONF_hadoop_proxyuser_hue_hosts=* +CORE_CONF_hadoop_proxyuser_hue_groups=* +CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec + +HDFS_CONF_dfs_webhdfs_enabled=true +HDFS_CONF_dfs_permissions_enabled=false +HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false + +YARN_CONF_yarn_log___aggregation___enable=true +YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ +YARN_CONF_yarn_resourcemanager_recovery_enabled=true +YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore +YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler +YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=4096 +YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=2 +YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate +YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true +YARN_CONF_yarn_resourcemanager_hostname=resourcemanager +YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 +YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 +YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 +YARN_CONF_yarn_timeline___service_enabled=true +YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true +YARN_CONF_yarn_timeline___service_hostname=historyserver +YARN_CONF_mapreduce_map_output_compress=true +YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec +YARN_CONF_yarn_nodemanager_resource_memory___mb=1024 
+YARN_CONF_yarn_nodemanager_resource_cpu___vcores=2 +YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 +YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs +YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle + +MAPRED_CONF_mapreduce_framework_name=yarn +MAPRED_CONF_mapred_child_java_opts=-Xmx1024m +MAPRED_CONF_mapreduce_map_memory_mb=1024 +MAPRED_CONF_mapreduce_reduce_memory_mb=718 +MAPRED_CONF_mapreduce_map_java_opts=-Xmx718m +MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx512m +MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ diff --git a/hadoop/hadoop-hive.env b/hadoop/hadoop-hive.env new file mode 100644 index 0000000000000000000000000000000000000000..489727e464a1ad035a6a2750e26ed392d69513eb --- /dev/null +++ b/hadoop/hadoop-hive.env @@ -0,0 +1,30 @@ +HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore +HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver +HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive +HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive +HIVE_SITE_CONF_datanucleus_autoCreateSchema=false +HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083 +HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false + +CORE_CONF_fs_defaultFS=hdfs://namenode:9000 +CORE_CONF_hadoop_http_staticuser_user=root +CORE_CONF_hadoop_proxyuser_hue_hosts=* +CORE_CONF_hadoop_proxyuser_hue_groups=* + +HDFS_CONF_dfs_webhdfs_enabled=true +HDFS_CONF_dfs_permissions_enabled=false + +YARN_CONF_yarn_log___aggregation___enable=true +YARN_CONF_yarn_resourcemanager_recovery_enabled=true +YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore 
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate +YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs +YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ +YARN_CONF_yarn_timeline___service_enabled=true +YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true +YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true +YARN_CONF_yarn_resourcemanager_hostname=resourcemanager +YARN_CONF_yarn_timeline___service_hostname=historyserver +YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 +YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 +YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 diff --git a/hadoop/hadoop.env b/hadoop/hadoop.env new file mode 100644 index 0000000000000000000000000000000000000000..2aba2e7a3a8d83d766f412be5aaedda1e746cc1b --- /dev/null +++ b/hadoop/hadoop.env @@ -0,0 +1,43 @@ +CORE_CONF_fs_defaultFS=hdfs://namenode:9000 +CORE_CONF_hadoop_http_staticuser_user=root +CORE_CONF_hadoop_proxyuser_hue_hosts=* +CORE_CONF_hadoop_proxyuser_hue_groups=* +CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec + +HDFS_CONF_dfs_webhdfs_enabled=true +HDFS_CONF_dfs_permissions_enabled=false +HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false + +YARN_CONF_yarn_log___aggregation___enable=true +YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ +YARN_CONF_yarn_resourcemanager_recovery_enabled=true +YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore +YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler +YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=4096 +YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=2 
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate +YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true +YARN_CONF_yarn_resourcemanager_hostname=resourcemanager +YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 +YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 +YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 +YARN_CONF_yarn_timeline___service_enabled=true +YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true +YARN_CONF_yarn_timeline___service_hostname=historyserver +YARN_CONF_mapreduce_map_output_compress=true +YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec +YARN_CONF_yarn_nodemanager_resource_memory___mb=2048 +YARN_CONF_yarn_nodemanager_resource_cpu___vcores=2 +YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 +YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs +YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle + +MAPRED_CONF_mapreduce_framework_name=yarn +MAPRED_CONF_mapred_child_java_opts=-Xmx2048m +MAPRED_CONF_mapreduce_map_memory_mb=2048 +MAPRED_CONF_mapreduce_reduce_memory_mb=1536 +MAPRED_CONF_mapreduce_map_java_opts=-Xmx1536m +MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx1024m +MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ diff --git a/hadoop/init.sql b/hadoop/init.sql new file mode 100644 index 0000000000000000000000000000000000000000..100d792491f25a8e55f395470c46d79768c7fa49 --- /dev/null +++ b/hadoop/init.sql @@ -0,0 +1 @@ +CREATE DATABASE IF NOT EXISTS hue; diff --git a/hadoop/overrides/namenode/entrypoint.sh b/hadoop/overrides/namenode/entrypoint.sh new file mode 100755 index 
0000000000000000000000000000000000000000..0c93764ffc4074cf4b7bd4d9bf3201fa80d82e7a --- /dev/null +++ b/hadoop/overrides/namenode/entrypoint.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Set some sensible defaults +export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020} + +function addProperty() { + local path=$1 + local name=$2 + local value=$3 + + local entry="<property><name>$name</name><value>${value}</value></property>" + local escapedEntry=$(echo $entry | sed 's/\//\\\//g') + sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path +} + +function configure() { + local path=$1 + local module=$2 + local envPrefix=$3 + + local var + local value + + echo "Configuring $module" + for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do + name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'` + var="${envPrefix}_${c}" + value=${!var} + echo " - Setting $name=$value" + addProperty /etc/hadoop/$module-site.xml $name "$value" + done +} + +configure /etc/hadoop/core-site.xml core CORE_CONF +configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF +configure /etc/hadoop/yarn-site.xml yarn YARN_CONF +configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF +configure /etc/hadoop/kms-site.xml kms KMS_CONF +configure /etc/hadoop/mapred-site.xml mapred MAPRED_CONF + +if [ "$MULTIHOMED_NETWORK" = "1" ]; then + echo "Configuring for multihomed network" + + # HDFS + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true + addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true + + # YARN + addProperty /etc/hadoop/yarn-site.xml 
yarn.resourcemanager.bind-host 0.0.0.0 + addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0 + addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0 + addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0 + + # MAPRED + addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0 +fi + +if [ -n "$GANGLIA_HOST" ]; then + mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig + mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig + + for module in mapred jvm rpc ugi; do + echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31" + echo "$module.period=10" + echo "$module.servers=$GANGLIA_HOST:8649" + done > /etc/hadoop/hadoop-metrics.properties + + for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do + echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31" + echo "$module.sink.ganglia.period=10" + echo "$module.sink.ganglia.supportsparse=true" + echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both" + echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40" + echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649" + done > /etc/hadoop/hadoop-metrics2.properties +fi + +function wait_for_it() +{ + local serviceport=$1 + local service=${serviceport%%:*} + local port=${serviceport#*:} + local retry_seconds=5 + local max_try=100 + let i=1 + + nc -z $service $port + result=$? + + until [ $result -eq 0 ]; do + echo "[$i/$max_try] check for ${service}:${port}..." + echo "[$i/$max_try] ${service}:${port} is not available yet" + if (( $i == $max_try )); then + echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/" + exit 1 + fi + + echo "[$i/$max_try] try in ${retry_seconds}s once again ..." 
+ let "i++" + sleep $retry_seconds + + nc -z $service $port + result=$? + done + echo "[$i/$max_try] $service:${port} is available." +} + +for i in ${SERVICE_PRECONDITION[@]} +do + wait_for_it ${i} +done + +# remove problematic package source +sed -i '$ d' /etc/apt/sources.list + +# create user from env +useradd -s /bin/bash -p $(openssl passwd $ADMIN_PASSWORD) $ADMIN_NAME +chown -R $ADMIN_NAME /home/$ADMIN_NAME/ + +# install python +if [[ $INSTALL_PYTHON == "true" ]]; then + apt-get update + echo Y | apt-get install nano python +fi + +# install sqoop +if [[ $INSTALL_SQOOP == "true" ]]; then + + echo "export HADOOP_MAPRED_HOME=/opt/hadoop-3.1.1" >> /root/.bashrc + echo "export HADOOP_COMMON_HOME=/opt/hadoop-3.1.1" >> /root/.bashrc + echo "export HADOOP_HDFS_HOME=/opt/hadoop-3.1.1" >> /root/.bashrc + echo "export YARN_HOME=/opt/hadoop-3.1.1" >> /root/.bashrc + echo "export HADOOP_COMMON_LIB_NATIVE_DIR=/opt/hadoop-3.1.1/lib/native" >> /root/.bashrc + echo "export SQOOP_HOME=/usr/lib/sqoop" >> /root/.bashrc + + echo "export HADOOP_MAPRED_HOME=/opt/hadoop-3.1.1" >> /home/$ADMIN_NAME/.bashrc + echo "export HADOOP_COMMON_HOME=/opt/hadoop-3.1.1" >> /home/$ADMIN_NAME/.bashrc + echo "export HADOOP_HDFS_HOME=/opt/hadoop-3.1.1" >> /home/$ADMIN_NAME/.bashrc + echo "export YARN_HOME=/opt/hadoop-3.1.1" >> /home/$ADMIN_NAME/.bashrc + echo "export HADOOP_COMMON_LIB_NATIVE_DIR=/opt/hadoop-3.1.1/lib/native" >> /home/$ADMIN_NAME/.bashrc + echo "export SQOOP_HOME=/usr/lib/sqoop" >> /home/$ADMIN_NAME/.bashrc + + cd /tmp + + curl http://us.mirrors.quenda.co/apache/sqoop/1.4.7/sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz --output sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz + tar -xvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz + mv sqoop-1.4.7.bin__hadoop-2.6.0/ /usr/lib/sqoop + echo "export PATH=$PATH:/usr/lib/sqoop/bin" >> /root/.bashrc + echo "export PATH=$PATH:/usr/lib/sqoop/bin" >> /home/$ADMIN_NAME/.bashrc + + curl https://downloads.mysql.com/archives/get/file/mysql-connector-java-8.0.16.tar.gz --output 
mysql-connector-java-8.0.16.tar.gz + tar -xvf mysql-connector-java-8.0.16.tar.gz + mv mysql-connector-java-8.0.16/mysql-connector-java-8.0.16.jar /usr/lib/sqoop/lib + + curl https://jdbc.postgresql.org/download/postgresql-42.2.6.jar --output postgresql-42.2.6.jar + mv postgresql-42.2.6.jar /usr/lib/sqoop/lib + + mv /usr/lib/sqoop/conf/sqoop-env-template.sh /usr/lib/sqoop/conf/sqoop-env.sh + echo "export HADOOP_COMMON_HOME=/opt/hadoop-3.1.1" >> /usr/lib/sqoop/conf/sqoop-env.sh + echo "export HADOOP_MAPRED_HOME=/opt/hadoop-3.1.1" >> /usr/lib/sqoop/conf/sqoop-env.sh + + rm sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz + rm mysql-connector-java-8.0.16.tar.gz + +fi + +exec "$@" diff --git a/hadoop/overrides/nodemanagers/entrypoint.sh b/hadoop/overrides/nodemanagers/entrypoint.sh new file mode 100755 index 0000000000000000000000000000000000000000..b8bce7f244bc79052e2ee91d5305700b3a6854f3 --- /dev/null +++ b/hadoop/overrides/nodemanagers/entrypoint.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Set some sensible defaults +export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020} + +function addProperty() { + local path=$1 + local name=$2 + local value=$3 + + local entry="<property><name>$name</name><value>${value}</value></property>" + local escapedEntry=$(echo $entry | sed 's/\//\\\//g') + sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path +} + +function configure() { + local path=$1 + local module=$2 + local envPrefix=$3 + + local var + local value + + echo "Configuring $module" + for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do + name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'` + var="${envPrefix}_${c}" + value=${!var} + echo " - Setting $name=$value" + addProperty /etc/hadoop/$module-site.xml $name "$value" + done +} + +configure /etc/hadoop/core-site.xml core CORE_CONF +configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF +configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
+configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF +configure /etc/hadoop/kms-site.xml kms KMS_CONF +configure /etc/hadoop/mapred-site.xml mapred MAPRED_CONF + +if [ "$MULTIHOMED_NETWORK" = "1" ]; then + echo "Configuring for multihomed network" + + # HDFS + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0 + addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true + addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true + + # YARN + addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0 + addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0 + addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0 + addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0 + + # MAPRED + addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0 +fi + +if [ -n "$GANGLIA_HOST" ]; then + mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig + mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig + + for module in mapred jvm rpc ugi; do + echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31" + echo "$module.period=10" + echo "$module.servers=$GANGLIA_HOST:8649" + done > /etc/hadoop/hadoop-metrics.properties + + for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do + echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31" + echo "$module.sink.ganglia.period=10" + echo "$module.sink.ganglia.supportsparse=true" + echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both" + echo 
"$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40" + echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649" + done > /etc/hadoop/hadoop-metrics2.properties +fi + +function wait_for_it() +{ + local serviceport=$1 + local service=${serviceport%%:*} + local port=${serviceport#*:} + local retry_seconds=5 + local max_try=100 + let i=1 + + nc -z $service $port + result=$? + + until [ $result -eq 0 ]; do + echo "[$i/$max_try] check for ${service}:${port}..." + echo "[$i/$max_try] ${service}:${port} is not available yet" + if (( $i == $max_try )); then + echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/" + exit 1 + fi + + echo "[$i/$max_try] try in ${retry_seconds}s once again ..." + let "i++" + sleep $retry_seconds + + nc -z $service $port + result=$? + done + echo "[$i/$max_try] $service:${port} is available." +} + +for i in ${SERVICE_PRECONDITION[@]} +do + wait_for_it ${i} +done + +sed -i '$ d' /etc/apt/sources.list + +# INSTALL PYTHON ON NODES +if [[ $INSTALL_PYTHON == "true" ]]; then + apt-get update + echo Y | apt-get install nano python +fi + +exec "$@" \ No newline at end of file diff --git a/hadoop/purge-hadoop-ensiie.sh b/hadoop/purge-hadoop-ensiie.sh new file mode 100755 index 0000000000000000000000000000000000000000..b2c9986d5777c1e9ec285b0f2444049233e9ac36 --- /dev/null +++ b/hadoop/purge-hadoop-ensiie.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# +# + +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml down -v +docker volume list | grep hadoop | awk '{ print $2 }' | xargs docker volume rm --force diff --git a/hadoop/start-hadoop-ensiie.sh b/hadoop/start-hadoop-ensiie.sh new file mode 100755 index 0000000000000000000000000000000000000000..6a9da5bedd037bbc85f784acf6f68936a5596355 --- /dev/null +++ b/hadoop/start-hadoop-ensiie.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# +# +# modification 2024 : test de la ram dispo et utilisation de tel ou tel fichier hadoop.env +# + +if [ $(free
--giga | grep "^Mem" | awk '{ print $2 }') -lt 8 ] +then cp ~/ensiie/exo/hadoop/hadoop-8goRAM.env ~/ensiie/exo/hadoop/hadoop.env +fi + +docker network create hbase 2>/dev/null +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml up -d namenode hive-metastore-postgresql +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml up -d datanode1 datanode2 +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml up -d resourcemanager nodemanager1 nodemanager2 historyserver +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml up -d hive-server hive-metastore +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml up -d spark-master spark-worker-1 spark-worker-2 + +my_ip=`ip route get 1 | awk '{ for (i=1;i<=NF;i++) { if ( $i == "src" ) { print $(i+1) ; exit } } }'` +echo "Namenode: (HDFS Filebrowser) http://${my_ip}:9870" +echo "Spark-master: http://${my_ip}:28083" +echo "History Server: http://${my_ip}:28188" diff --git a/hadoop/stop-hadoop-ensiie.sh b/hadoop/stop-hadoop-ensiie.sh new file mode 100755 index 0000000000000000000000000000000000000000..1dab2a9c2dec2aa5a409489999990ee05d3ff387 --- /dev/null +++ b/hadoop/stop-hadoop-ensiie.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# +# +docker compose -f ~/ensiie/exo/hadoop/docker-compose-ensiie-v3.yml stop