
# izone/hadoop

This image is built on Hadoop 2.8.5 and provides a complete big data processing and analytics ecosystem. It supports several Linux distributions (Debian Jessie, CentOS 7, CentOS 6.8, and lightweight Alpine Linux) and integrates distributed computing, data storage, ETL tools, machine learning libraries, and development environments, making it suitable for big data engineering, data analysis, and machine learning workloads.
```bash
# Download the cluster management script
curl -L [***] -o ~/zoneCluster.sh
alias zoneCluster="bash ~/zoneCluster.sh"

# Create the notebook directory (used as a volume mount)
mkdir $HOME/notebooks
```
Single-node cluster (2 nodes total, including the namenode):
```bash
zoneCluster
```
Multi-node cluster (up to 9 datanodes, 10 nodes total including the namenode):
```bash
zoneCluster 3          # create 3 datanodes (4 nodes total)
docker logs -f Hadoop  # follow the cluster logs
```
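Once the containers are up, you can check that every datanode has registered with the namenode. A minimal sketch, assuming the namenode container is named `Hadoop` as above:

```bash
# Count the live datanodes reported by the namenode
docker exec Hadoop hdfs dfsadmin -report | grep "Live datanodes"
```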
```bash
zoneCluster { stop | start | remove | Stop | pseudo | cos6 | cos7 | alpine }
```
- `stop` / `start`: stop / start the cluster
- `remove`: remove the cluster
- `Stop`: stop and then remove the cluster
- `pseudo`: create a pseudo-distributed instance
- `cos6` / `cos7` / `alpine`: select the operating system version

Web interfaces:

- YARN ResourceManager: http://localhost:8088
- HDFS NameNode: http://localhost:50070
- HBase Master: http://localhost:60010
- Jupyter terminal: http://localhost:8888/terminals/1 (type `bash` to start a shell)
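A typical lifecycle with these subcommands might look like the following sketch:

```bash
zoneCluster 3      # create a cluster with 3 datanodes
zoneCluster stop   # stop the containers without removing them
zoneCluster start  # bring the same cluster back up
zoneCluster Stop   # stop and remove everything
```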
Start a cluster that includes database containers:

```bash
zoneCluster 2 -db
```
Import data from MariaDB:
```bash
# Run in a Jupyter terminal
sqoop import \
  --connect jdbc:mysql://mariadb:3306/mysql \
  --username root \
  --password maria \
  --table user -m 1

# Verify the data in HDFS
hdfs dfs -ls -R user
```
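To inspect the imported records themselves, you can print the generated part files. A sketch, assuming Sqoop's default output layout (one `part-m-*` file per mapper, comma-separated fields):

```bash
# Show the first imported rows (file name assumes Sqoop's default layout)
hdfs dfs -cat user/part-m-00000 | head
```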
Import data from Oracle:
Enter the Oracle container and prepare the data:
```bash
docker exec -ti OracleXE bash
cd $HOME/data/
curl -O [***]
unzip ml-20m.zip
cd ml-20m
# Sample the last 1% of ratings.csv into a smaller file
cat ratings.csv | tail -n $((`cat ratings.csv | wc -l` / 100)) > ml_ratings.csv
```
Create the table in Oracle:
```sql
-- Connect as SYSDBA: sqlplus sys/oracle as sysdba
CREATE USER aluno IDENTIFIED BY dsacademy;
GRANT connect, resource, unlimited tablespace TO aluno;

-- Reconnect as the new user
CONN aluno/dsacademy@XE

CREATE TABLE cinema (
  ID NUMBER PRIMARY KEY,
  USER_ID VARCHAR2(30),
  MOVIE_ID VARCHAR2(30),
  RATING DECIMAL(30),
  TIMESTAMP VARCHAR2(256)
);
```
Load the data with SQL*Loader:
```bash
# Write the SQL*Loader control file
tee $HOME/data/loader.dat <<EOF
load data
INFILE '$HOME/data/ml-20m/ml_ratings.csv'
INTO TABLE cinema
APPEND
FIELDS TERMINATED BY ','
trailing nullcols
(id SEQUENCE (MAX,1),
 user_id CHAR(30),
 movie_id CHAR(30),
 rating decimal external,
 timestamp char(256))
EOF

sqlldr userid=aluno/dsacademy control=$HOME/data/loader.dat log=$HOME/data/loader.log
```
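To confirm the load, the rows can be counted from SQL*Plus. A minimal sketch, reusing the `aluno` credentials created above:

```bash
# Count the rows loaded into the cinema table
echo "SELECT COUNT(*) FROM cinema;" | sqlplus -s aluno/dsacademy@XE
```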
Import the Oracle data with Sqoop:
```bash
# Run in a Jupyter terminal
sqoop import \
  --connect jdbc:oracle:thin:@oraclexe:1521:XE \
  --username aluno \
  --password dsacademy \
  --query "select user_id, movie_id from cinema where rating = 1 and \$CONDITIONS" \
  --target-dir /user/oracle/output -m 1
```
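As before, the result can be inspected directly in HDFS; the part-file name below assumes Sqoop's single-mapper default:

```bash
hdfs dfs -ls /user/oracle/output
hdfs dfs -cat /user/oracle/output/part-m-00000 | head
```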
#### 4.2.2 Structured Data Processing with Hive
Prepare the data:
```bash
# Download the dataset and upload it to HDFS
curl -O [***]
hdfs dfs -mkdir /hive
hdfs dfs -copyFromLocal empregados.csv /hive
```
Initialize the Hive metastore:
```bash
schematool -initSchema -dbType derby

# If initialization fails, clean up the metastore and retry
rm -fR metastore_db
```
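Before moving on, you can check that the schema was actually created. A sketch using schematool's `-info` option:

```bash
# Print the metastore schema version that was just initialized
schematool -info -dbType derby
```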
Hive table operations:
```sql
-- Start the Hive CLI with: hive

-- Create a staging table
CREATE TABLE temp_colab (texto String);

-- Load the raw data
LOAD DATA INPATH '/hive/empregados.csv' OVERWRITE INTO TABLE temp_colab;

-- Create the structured table and extract the fields
CREATE TABLE IF NOT EXISTS colaboradores(
  id int,
  nome String,
  cargo String,
  salario double,
  cidade String
);

INSERT OVERWRITE TABLE colaboradores
SELECT
  regexp_extract(texto, '^(?:([^,]*),?){1}', 1) ID,
  regexp_extract(texto, '^(?:([^,]*),?){2}', 1) nome,
  regexp_extract(texto, '^(?:([^,]*),?){3}', 1) cargo,
  regexp_extract(texto, '^(?:([^,]*),?){4}', 1) salario,
  regexp_extract(texto, '^(?:([^,]*),?){5}', 1) cidade
FROM temp_colab;
```
Sample HiveQL queries:
```sql
SELECT * FROM colaboradores;
SELECT * FROM colaboradores WHERE id = 3002;
SELECT sum(salario), cidade FROM colaboradores GROUP BY cidade;
```
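Query results can also be written back to HDFS for downstream jobs. A sketch using standard HiveQL via `hive -e`; the output path is illustrative:

```bash
# Write per-city salary totals to an HDFS directory (path is illustrative)
hive -e "
INSERT OVERWRITE DIRECTORY '/hive/output/salario_por_cidade'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT cidade, sum(salario) FROM colaboradores GROUP BY cidade;"
```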
Prepare the data for Mahout naive Bayes spam classification:
```bash
# Create the HDFS directories
hdfs dfs -mkdir -p /mahout/input/{ham,spam}

# Download and upload the datasets
curl [***] | tar -xzf -
curl [***] | tar -xzf -
hdfs dfs -copyFromLocal ham/* /mahout/input/ham
hdfs dfs -copyFromLocal spam/* /mahout/input/spam
```
Data conversion and model training:
```bash
# Convert to sequence files
mahout seqdirectory -i /mahout/input -o /mahout/output/seqoutput

# Convert to TF-IDF vectors
mahout seq2sparse -i /mahout/output/seqoutput -o /mahout/output/sparseoutput

# Split into training and test sets
mahout split -i /mahout/output/sparseoutput/tfidf-vectors \
  --trainingOutput /mahout/nbTrain --testOutput /mahout/nbTest \
  --randomSelectionPct 30 --overwrite --sequenceFiles -xm sequential

# Train the model
mahout trainnb -i /mahout/nbTrain -li /mahout/nbLabels -o /mahout/nbmodel -ow -c

# Test the model
mahout testnb -i /mahout/nbTest -m /mahout/nbmodel -l /mahout/nbLabels -ow -o /mahout/nbpredictions -c
```
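`testnb` prints a confusion matrix to the console; to look at the raw prediction output you can dump the sequence files it wrote. A sketch using Mahout's generic `seqdumper` tool:

```bash
# Dump the per-document prediction records written by testnb
mahout seqdumper -i /mahout/nbpredictions | head -n 40
```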
Prepare the data for Mahout K-Means clustering:
```bash
# Create the HDFS directory
hdfs dfs -mkdir -p /mahout/clustering/data

# Download and upload the dataset
curl [***] | tar -xzf -
hdfs dfs -copyFromLocal news/* /mahout/clustering/data
```
Data conversion and model training:
```bash
# Convert to sequence files
mahout seqdirectory -i /mahout/clustering/data -o /mahout/clustering/kmeansseq

# Convert to TF-IDF vectors
mahout seq2sparse -i /mahout/clustering/kmeansseq -o /mahout/clustering/kmeanssparse

# Train the K-Means model (k=3, up to 10 iterations, cosine distance)
mahout kmeans -i /mahout/clustering/kmeanssparse/tfidf-vectors/ \
  -c /mahout/clustering/kmeanscentroids -cl \
  -o /mahout/clustering/kmeansclusters \
  -k 3 -ow -x 10 -dm org.apache.mahout.common.distance.CosineDistanceMeasure

# Export the clustering results
mahout clusterdump -d /mahout/clustering/kmeanssparse/dictionary.file-0 -dt sequencefile \
  -i /mahout/clustering/kmeansclusters/clusters-1-final \
  -n 20 -b 100 -o clusterdump.txt \
  -p /mahout/clustering/kmeansclusters/clusteredPoints/

# View the results
cat clusterdump.txt
```
- Jupyter: http://localhost:8888
- Spark UI: http://localhost:4040
- RStudio: http://localhost:8787 (user `root`, password `root`)

Julia example (multiple linear regression):

```bash
# Run in a Jupyter terminal
curl -O [***]
curl -O [***]
julia multilinreg.jl
```
Create a pseudo-distributed instance:

```bash
zoneCluster pseudo
```
Or run the Docker command directly:
```bash
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 -p 8042:8042 -p 50070:50070 -p 8888:8888 -p 4040:4040 \
  -v $HOME/notebooks:/root/notebooks \
  -ti izone/hadoop:ecosystem bash
```
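Inside the container, you can confirm that the Hadoop daemons came up before submitting any jobs; `jps` lists the running JVM processes:

```bash
# Expect to see NameNode, DataNode, ResourceManager, NodeManager, ...
jps
```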
MapReduce WordCount example:

```bash
# Create an HDFS directory
hdfs dfs -mkdir /bigdata

# Download the file and upload it to HDFS
wget -c [***]
hadoop fs -copyFromLocal contratos.csv /bigdata

# Run the WordCount job
hadoop jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.5.jar \
  wordcount /bigdata/contratos.csv /output

# View the results
hdfs dfs -cat /output/*
```
Run the following in a Jupyter notebook:
```python
# Shell commands (prefixed with !)
!mkdir datasets
!curl -L [***] -o datasets/book.txt
!hdfs dfs -mkdir -p /spark/input
!hdfs dfs -put datasets/book.txt /spark/input

# Spark word count
text_file = sc.textFile("hdfs://localhost:9000/spark/input/book.txt")
counts = text_file.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.saveAsTextFile("hdfs://localhost:9000/spark/output")

# View the results
!hdfs dfs -ls /spark/output
!hdfs dfs -cat /spark/output/part-00000
```
Pull and run the base image:

```bash
docker pull izone/hadoop

# Run (the optional -test argument runs the PI test)
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 -p 8042:8042 -p 50070:50070 \
  -ti izone/hadoop -test bash
```
OS variants:

```bash
docker pull izone/hadoop:cos7
docker pull izone/hadoop:cos6
docker pull izone/hadoop:alpine
```

Anaconda integration:
```bash
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 -p 8042:8042 -p 50070:50070 -p 8888:8888 -p 4040:4040 \
  -v $HOME/notebooks:/root/notebooks \
  -ti izone/hadoop:anaconda bash
```
RStudio integration:
```bash
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 -p 8042:8042 -p 50070:50070 -p 8888:8888 -p 4040:4040 -p 8787:8787 \
  -v $HOME/notebooks:/root/notebooks \
  -ti izone/hadoop:rstudio bash
```
Build the images from source:

```bash
git clone [***]
cd hadoop
docker build -t izone/hadoop . && \
docker build -t izone/hadoop:anaconda ./anaconda/ && \
docker build -t izone/hadoop:rstudio ./rstudio/ && \
docker build -t izone/hadoop:julia ./julia/ && \
docker build -t izone/hadoop:ecosystem ./ecosystem/ && \
docker build -t izone/hadoop:cluster ./cluster/ && \
docker build -t izone/hadoop:datanode ./cluster/datanode/
```
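After the builds finish, a quick check that every tag exists locally:

```bash
# List the locally built izone/hadoop images and their tags
docker images izone/hadoop
```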



