内容简介:本文风哥教程参考Linux官方文档、Red Hat Enterprise Linux官方文档、Ansible Automation Platform官方文档、Docker官方文档、Kubernetes官方文档和Podman官方文档等内容,详细介绍了相关技术的配置和使用方法。
风哥提示:本文档介绍大数据平台部署综合实战案例。
Part01-Hadoop集群部署
1.1 Hadoop安装配置
[root@fgedu-hadoop ~]# yum install -y java-1.8.0-openjdk java-1.8.0-openjdk-devel
# 下载Hadoop
[root@fgedu-hadoop ~]# wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
[root@fgedu-hadoop ~]# tar xzf hadoop-3.3.6.tar.gz -C /usr/local/
[root@fgedu-hadoop ~]# ln -s /usr/local/hadoop-3.3.6 /usr/local/hadoop
# 配置环境变量
[root@fgedu-hadoop ~]# cat >> /etc/profile << 'EOF'
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
EOF
[root@fgedu-hadoop ~]# source /etc/profile
# 配置core-site.xml
[root@fgedu-hadoop ~]# cat > $HADOOP_HOME/etc/hadoop/core-site.xml << 'EOF'
EOF
# 配置hdfs-site.xml
[root@fgedu-hadoop ~]# cat > $HADOOP_HOME/etc/hadoop/hdfs-site.xml << 'EOF'
EOF
# 格式化NameNode
[root@fgedu-hadoop ~]# hdfs namenode -format
2026-04-04 23:00:00,000 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = fgedu-namenode/192.168.1.100
STARTUP_MSG: args = [-format]
************************************************************/
…
2026-04-04 23:00:00,000 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at fgedu-namenode/192.168.1.100
************************************************************/
# 启动HDFS
[root@fgedu-hadoop ~]# start-dfs.sh
Starting namenodes on [fgedu-namenode]
Starting datanodes
Starting secondary namenodes [fgedu-secondary]
# 查看进程
[root@fgedu-hadoop ~]# jps
12345 NameNode
12567 SecondaryNameNode
12890 Jps
Part02-Spark集群部署
2.1 Spark安装配置
[root@fgedu-spark ~]# wget https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
[root@fgedu-spark ~]# tar xzf spark-3.5.0-bin-hadoop3.tgz -C /usr/local/
[root@fgedu-spark ~]# ln -s /usr/local/spark-3.5.0-bin-hadoop3 /usr/local/spark
# 配置环境变量
[root@fgedu-spark ~]# cat >> /etc/profile << 'EOF'
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
EOF
# 配置spark-env.sh
[root@fgedu-spark ~]# cat > $SPARK_HOME/conf/spark-env.sh << 'EOF'
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export SPARK_MASTER_HOST=fgedu-spark-master
export SPARK_MASTER_PORT=7077
export SPARK_WORKER_CORES=4
export SPARK_WORKER_MEMORY=8g
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
EOF
# 配置workers(Spark 3.1起,conf/slaves 已更名为 conf/workers)
[root@fgedu-spark ~]# cat > $SPARK_HOME/conf/workers << 'EOF'
fgedu-spark-worker1
fgedu-spark-worker2
fgedu-spark-worker3
EOF
# 启动Spark集群
[root@fgedu-spark ~]# start-all.sh
starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.master.Master-1-fgedu-spark-master.out
fgedu-spark-worker1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-fgedu-spark-worker1.out
fgedu-spark-worker2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-fgedu-spark-worker2.out
fgedu-spark-worker3: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-fgedu-spark-worker3.out
# 查看Spark Master Web UI
[root@fgedu-spark ~]# curl -s http://localhost:8080 | head -20
Part03-Hive数据仓库
3.1 Hive安装配置
[root@fgedu-hive ~]# wget https://downloads.apache.org/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz
[root@fgedu-hive ~]# tar xzf apache-hive-3.1.3-bin.tar.gz -C /usr/local/
[root@fgedu-hive ~]# ln -s /usr/local/apache-hive-3.1.3-bin /usr/local/hive
# 配置环境变量
[root@fgedu-hive ~]# cat >> /etc/profile << 'EOF'
export HIVE_HOME=/usr/local/hive
export PATH=$PATH:$HIVE_HOME/bin
EOF
# 配置hive-site.xml
[root@fgedu-hive ~]# cat > $HIVE_HOME/conf/hive-site.xml << 'EOF'
EOF
# 初始化Hive元数据
[root@fgedu-hive ~]# schematool -dbType mysql -initSchema
Metastore connection URL: jdbc:mysql://fgedu-mysql:3306/hive
Metastore Connection Driver : com.mysql.cj.jdbc.Driver
Metastore connection User: hive
Starting metastore schema initialization to 3.1.0
Initialization script hive-schema-3.1.0.mysql.sql
Initialization script completed
schemaTool completed
# 创建测试表
[root@fgedu-hive ~]# hive
hive> CREATE TABLE fgedu_users (
> id INT,
> name STRING,
> email STRING
> )
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ',';
OK
Time taken: 2.5 seconds
hive> SHOW TABLES;
OK
fgedu_users
Time taken: 0.5 seconds
Part04-数据导入导出
4.1 数据处理实战
[root@fgedu-spark ~]# spark-shell
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.0
      /_/
scala> val df = spark.read.csv("hdfs://fgedu-namenode:9000/data/users.csv")
df: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 1 more field]
scala> df.show()
+---+-------+-----------------+
|_c0|    _c1|              _c2|
+---+-------+-----------------+
|  1|风哥1号|zhangsan@fgedu.cn|
|  2|风哥2号|lisi@fgedu.net.cn|
|  3|   王五|  wangwu@fgedu.cn|
+---+-------+-----------------+
scala> df.write.parquet("hdfs://fgedu-namenode:9000/output/users_parquet")
scala> :quit
# 使用Sqoop导入数据
[root@fgedu-hive ~]# sqoop import \
  --connect jdbc:mysql://fgedu-mysql:3306/fgedu_db \
  --username root \
  --password MySQL@123 \
  --table users \
  --hive-import \
  --hive-table fgedu_users \
  -m 1
# 创建数据分析脚本
[root@fgedu-hive ~]# cat > /usr/local/bin/data-analysis.sh << 'EOF'
#!/bin/bash
# data-analysis.sh
# from:www.itpux.com.qq113257174.wx:itpux-com
# web: http://www.fgedu.net.cn
echo "=== 数据分析任务 ==="
echo "执行时间: $(date)"
echo "1. HDFS状态"
hdfs dfsadmin -report | head -10
echo ""
echo "2. Spark任务状态"
curl -s http://localhost:8080/api/v1/applications | jq -r '.[] | "\(.name): \(.attempts[0].completed)"'
echo ""
echo "3. Hive表统计"
hive -e "SHOW TABLES;" | while read table; do
  count=$(hive -e "SELECT COUNT(*) FROM $table;" 2>/dev/null | tail -1)
  echo "$table: $count 行"
done
echo "=== 分析完成 ==="
EOF
[root@fgedu-hive ~]# chmod +x /usr/local/bin/data-analysis.sh
- 合理规划集群资源
- 配置数据高可用
- 优化数据处理性能
- 建立数据治理规范
- 配置监控告警
本文由风哥教程整理发布,仅用于学习测试使用,转载注明出处:http://www.fgedu.net.cn/10327.html
