Hive Database Setup - Jerry的小站

Jerry Gao

God is truth, and truth is God

Hive is a data warehouse tool built on Hadoop. It maps structured data files to database tables, provides a complete SQL query interface, and translates SQL statements into MapReduce jobs for execution.
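
To make that concrete, here is a minimal, hypothetical smoke test: it connects to HiveServer2 through beeline and runs an aggregate that Hive compiles into a MapReduce job. Host, port, user, and table name are all placeholders for your deployment.

# Hypothetical smoke test: the COUNT(*) below runs as a MapReduce job
beeline -u jdbc:hive2://localhost:10000 -n root \
  -e "SELECT COUNT(*) FROM some_table;"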

Building the Image

Directory Structure

.
├── Dockerfile
├── bin
│   └── entrypoint.sh
├── conf
│   ├── hadoop
│   │   ├── core-site.xml
│   │   ├── hdfs-site.xml
│   │   ├── mapred-site.xml
│   │   └── yarn-site.xml
│   └── hive
│       └── hive-site.xml
├── docker-entrypoint.sh
├── gz
│   ├── apache-hive-3.1.2-bin.tar.gz
│   └── mysql-connector-j-8.0.32.tar.gz
└── tag
  1. Download the resources
# apache-hive-3.1.2-bin.tar.gz
wget https://downloads.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
# mysql-connector-j-8.0.32.tar.gz
wget https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-j-8.0.32.tar.gz

Place the archives into the corresponding directories (the gz directory in the tree above).
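
Before baking the archives into the image, it is worth checking their integrity; a quick sketch, assuming you compare the output against the checksums published by Apache and MySQL:

# Compare against the digests published on downloads.apache.org and dev.mysql.com
sha512sum gz/apache-hive-3.1.2-bin.tar.gz
sha256sum gz/mysql-connector-j-8.0.32.tar.gz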

  2. bin/entrypoint.sh
cp /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/hive/lib/guava-27.0-jre.jar
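
This copy exists because Hive 3.1.2 ships guava-19.0.jar, which is incompatible with Hadoop 3.x and causes a NoSuchMethodError on com.google.common.base.Preconditions at startup; the Dockerfile below removes the old jar, and this line keeps the Hadoop-provided one in place. To confirm the state after the swap (paths match the image layout used throughout this post):

# Only the 27.0-jre copy should remain under /opt/hive/lib after the swap
ls /opt/hive/lib/guava-*.jar /opt/hadoop/share/hadoop/common/lib/guava-*.jar
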
  3. conf/hadoop/*.xml

3.1 core-site.xml

<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://0.0.0.0:9000</value>
  </property>
  <property>
    <name>hadoop.proxyuser.USER.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.USER.groups</name>
    <value>*</value>
  </property>
</configuration>
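
The literal USER in the proxyuser keys is a placeholder that docker-entrypoint.sh replaces with the real user name at startup; HiveServer2 relies on these grants to impersonate clients against HDFS. Once the daemons are running, a quick check that fs.defaultFS answers (the address is illustrative):

# The NameNode should answer on the fs.defaultFS address
hdfs dfs -ls hdfs://localhost:9000/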

3.2 hdfs-site.xml

<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/opt/hadoop/name</value>
  </property>
  <property>
    <name>dfs.datanode.address</name>
    <value>[your-ip]:9866</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/opt/hadoop/data</value>
  </property>
</configuration>
  • [your-ip]: your IP address (the address the DataNode advertises for data transfer)
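
With dfs.replication set to 1, the single DataNode in this container is all HDFS has, so it is worth confirming after startup that it registered on the advertised address:

# Shows live DataNodes and the data-transfer address from dfs.datanode.address
hdfs dfsadmin -report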

3.3 mapred-site.xml

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*</value>
  </property>
</configuration>
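
The HADOOP_MAPRED_HOME variables and mapreduce.application.classpath are what let YARN containers locate the MapReduce jars; without them, jobs typically fail with "Could not find or load main class org.apache.hadoop.mapreduce.v2.app.MRAppMaster". A quick end-to-end check with the examples jar bundled in Hadoop 3.2.4:

# Submitting any small job exercises the full YARN/MapReduce classpath wiring
hadoop jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.4.jar pi 2 10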

3.4 yarn-site.xml

<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>
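
The mapreduce_shuffle auxiliary service is what serves map output to reducers; if it is missing, reduce tasks cannot fetch their input. To confirm the NodeManager registered with the ResourceManager:

# Lists registered NodeManagers and their state
yarn node -list
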
  4. conf/hive/hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>MYSQL_URI</value>
    <description>
      JDBC connect string for a JDBC metastore.
      To use SSL to encrypt/authenticate the connection, provide a database-specific SSL flag in the connection URL.
      For example, jdbc:postgresql://myhost/db?ssl=true for a postgres database.
    </description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore (com.mysql.cj.jdbc.Driver is the non-deprecated class for Connector/J 8.x)</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>USER</value>
    <description>username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>PASS</value>
    <description>password to use against metastore database</description>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/hive/warehouse</value>
    <description>Hive default warehouse directory; change it if necessary</description>
  </property>
  <property>
    <name>hive.support.concurrency</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.enforce.bucketing</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.exec.dynamic.partition.mode</name>
    <value>nonstrict</value>
  </property>
  <property>
    <name>hive.txn.manager</name>
    <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
  </property>
  <property>
    <name>hive.compactor.initiator.on</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.compactor.worker.threads</name>
    <value>1</value>
  </property>
  <property>
    <name>hive.in.test</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
  </property>
</configuration>
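
The concurrency, DbTxnManager, and compactor properties above jointly enable Hive ACID tables. A minimal sketch of what that unlocks, run through beeline once everything is up (connection details and the table name are placeholders):

# ACID DML needs the DbTxnManager/compactor settings plus an ORC table
# flagged as transactional
beeline -u jdbc:hive2://localhost:10000 -n root -e "
CREATE TABLE acid_demo (id INT, name STRING)
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
INSERT INTO acid_demo VALUES (1, 'a');
UPDATE acid_demo SET name = 'b' WHERE id = 1;"
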
  5. docker-entrypoint.sh
#!/bin/bash
if [[ -z $MYSQL_URI ]]; then
  echo "Env MYSQL_URI must be set."
  exit 1
fi

if [[ -z $USER ]]; then
  echo "Env USER must be set."
  exit 1
fi

if [[ -z $PASS ]]; then
  echo "Env PASS must be set."
  exit 1
fi

# Fill in the placeholders left in the config files at build time
sed -i 's|MYSQL_URI|'"$MYSQL_URI"'|g' /opt/hive/conf/hive-site.xml
sed -i 's|USER|'"$USER"'|g' /opt/hive/conf/hive-site.xml
sed -i 's|PASS|'"$PASS"'|g' /opt/hive/conf/hive-site.xml
sed -i 's|USER|'"$USER"'|g' /opt/hadoop/etc/hadoop/core-site.xml

# /etc/profile holds HADOOP_HOME, PATH, and the HDFS_*/YARN_* user variables
# set in the Dockerfile; CMD does not run a login shell, so source it here
source /etc/profile

# start-all.sh reaches the local daemons over ssh, so sshd must be running at
# container start (a `service ssh start` in a RUN layer does not persist)
service ssh start

# Format the NameNode only on first boot; reformatting on every restart
# would wipe the existing HDFS metadata
if [ ! -d /opt/hadoop/name/current ]; then
  hdfs namenode -format
fi

start-all.sh
hiveserver2
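
To exercise the entrypoint outside Kubernetes, a sketch with placeholder credentials and a placeholder image tag:

# All three variables are required by the checks at the top of the script
docker run -d --name hive \
  -p 10000:10000 -p 9000:9000 -p 8088:8088 -p 8042:8042 \
  -e USER=hive -e PASS=secret \
  -e MYSQL_URI='jdbc:mysql://mysql-host:3306/hive_meta?relaxAutoCommit=true' \
  your-image-tag
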
  6. Dockerfile
FROM ubuntu:20.04

USER root

COPY gz/apache-hive-3.1.2-bin.tar.gz /gz/apache-hive-3.1.2-bin.tar.gz

RUN apt-get update && apt-get install -y wget && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y ssh && \
    wget "http://fileserver.tapdata.io/$/YMfp7" -O java-8.tar.gz && \
    mkdir /usr/java && \
    tar -xf java-8.tar.gz -C /usr/java && \
    rm -rf java-8.tar.gz && \
    update-alternatives --install "/usr/bin/java" "java" "/usr/java/jdk1.8.0_381/bin/java" 1 && \
    update-alternatives --set java /usr/java/jdk1.8.0_381/bin/java && \
    apt-get install locales libc6-i386 -y && locale-gen en_US.UTF-8

RUN tar -xzf /gz/apache-hive-3.1.2-bin.tar.gz -C /opt/ && \
    mv /opt/apache-hive-3.1.2-bin/ /opt/hive/ && \
    rm -rf /gz/apache-hive-3.1.2-bin.tar.gz

RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-3.2.4/hadoop-3.2.4.tar.gz && \
    tar -xvf hadoop-3.2.4.tar.gz -C /opt/ && \
    mv /opt/hadoop-3.2.4/ /opt/hadoop/ && \
    rm -rf hadoop-3.2.4.tar.gz

COPY gz/mysql-connector-j-8.0.32.tar.gz /gz/mysql-connector-j-8.0.32.tar.gz

# Swap Hive's bundled guava-19.0 for Hadoop's guava-27.0-jre and install the
# MySQL JDBC driver into Hive's lib directory
RUN rm -rf /opt/hive/lib/guava-19.0.jar && \
    cp /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/hive/lib/guava-27.0-jre.jar && \
    tar -xvf /gz/mysql-connector-j-8.0.32.tar.gz -C /gz/ && \
    cp /gz/mysql-connector-j-8.0.32/mysql-connector-j-8.0.32.jar /opt/hive/lib/ && \
    rm -rf /gz/mysql-connector-j-8.0.32 /gz/mysql-connector-j-8.0.32.tar.gz

# Environment sourced by docker-entrypoint.sh; note that JAVA_HOME must point
# at the JDK root, not its bin directory
RUN echo 'export JAVA_HOME="/usr/java/jdk1.8.0_381"' >> /etc/profile && \
    echo 'export HADOOP_HOME="/opt/hadoop"' >> /etc/profile && \
    echo 'export HIVE_HOME="/opt/hive"' >> /etc/profile && \
    echo 'export PATH="$HADOOP_HOME/bin:$HADOOP_HOME/sbin:/usr/java/jdk1.8.0_381/bin:$PATH:$HIVE_HOME/bin"' >> /etc/profile && \
    echo 'export JAVA_TOOL_OPTIONS="-Dfile.encoding=UTF8"' >> /etc/profile && \
    echo 'export LC_ALL="en_US.UTF-8"' >> /etc/profile && \
    echo 'export HDFS_NAMENODE_USER="root"' >> /etc/profile && \
    echo 'export HDFS_DATANODE_USER="root"' >> /etc/profile && \
    echo 'export HDFS_SECONDARYNAMENODE_USER="root"' >> /etc/profile && \
    echo 'export YARN_RESOURCEMANAGER_USER="root"' >> /etc/profile && \
    echo 'export YARN_NODEMANAGER_USER="root"' >> /etc/profile

COPY conf/hive/hive-site.xml /opt/hive/conf/hive-site.xml
COPY conf/hadoop /opt/hadoop/etc/hadoop/
COPY docker-entrypoint.sh /bin/docker-entrypoint.sh

# Passwordless ssh to localhost is needed by start-all.sh; sshd itself is
# started by the entrypoint at container run time
RUN cp /opt/hive/conf/hive-default.xml.template /opt/hive/conf/hive-default.xml && \
    mkdir -p /opt/hadoop/name /opt/hadoop/data ~/.ssh && \
    ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
    touch ~/.ssh/authorized_keys && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    printf 'Host *\n  StrictHostKeyChecking no\n' > ~/.ssh/config && \
    echo 'export JAVA_HOME="/usr/java/jdk1.8.0_381"' >> /opt/hadoop/etc/hadoop/hadoop-env.sh && \
    chmod +x /bin/docker-entrypoint.sh

CMD ["/bin/docker-entrypoint.sh"]
  7. tag
your docker tag
  8. Build the image
docker build -t `cat tag` .
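
If the tag refers to a remote registry, push the image afterwards so the cluster can pull it:

docker push "$(cat tag)"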

Kubernetes Deployment

  1. Create the ConfigMap
kind: ConfigMap
apiVersion: v1
metadata:
  name: hive-config
  namespace: stable-db
data:
  mapred-site.xml: |-
    <configuration>
      <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
      </property>
      <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
      </property>
      <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
      </property>
      <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
      </property>
      <property>
        <name>mapreduce.application.classpath</name>
        <value>/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*</value>
      </property>
    </configuration>
  yarn-site.xml: |-
    <configuration>
      <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>0.0.0.0</value>
      </property>
      <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
      </property>
    </configuration>

Keeping these files in a ConfigMap makes it easy to change the configuration without rebuilding the image.

Configuration notes:

  • mapred-site.xml
    • mapreduce.framework.name: run MapReduce on YARN
    • yarn.app.mapreduce.am.env: environment for the MapReduce ApplicationMaster
    • mapreduce.map.env: environment for MapReduce map tasks
    • mapreduce.reduce.env: environment for MapReduce reduce tasks
    • mapreduce.application.classpath: classpath for MapReduce at runtime
  • yarn-site.xml
    • yarn.resourcemanager.hostname: hostname of the ResourceManager
    • yarn.nodemanager.aux-services: auxiliary services for the NodeManager
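
Apply the manifest and confirm both XML files landed in the ConfigMap (the file name is whatever you saved the manifest as):

kubectl apply -f hive-configmap.yaml
kubectl -n stable-db describe configmap hive-config
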
  2. Create the StatefulSet
kind: StatefulSet
apiVersion: apps/v1
metadata:
  labels:
    app: hive
  name: hive
  namespace: stable-db
spec:
  replicas: 1
  selector:
    matchLabels:
      app: hive
  template:
    metadata:
      labels:
        app: hive
    spec:
      volumes:
        - name: hive-config
          configMap:
            name: hive-config
            items:
              - key: mapred-site.xml
                path: mapred-site.xml
              - key: yarn-site.xml
                path: yarn-site.xml
            defaultMode: 420
      containers:
        - name: hive
          image: [your image address]
          ports:
            - containerPort: 10000
              name: hive
              protocol: TCP
            - containerPort: 9000
              name: hdfs
              protocol: TCP
            - containerPort: 8088
              name: resourcemanager
              protocol: TCP
            - containerPort: 8042
              protocol: TCP
              name: node-port
          env:
            - name: USER
              value: <username>
            - name: PASS
              value: <password>
            - name: MYSQL_URI
              value: >-
                jdbc:<MYSQL_URI>?relaxAutoCommit=true
            - name: JAVA_OPTS
              value: '-Xms1024m -Xmx4096m' # adjust as needed
          resources:
            limits:
              cpu: '2'
              memory: 4Gi
              ephemeral-storage: 10Gi
            requests:
              cpu: '2'
              memory: 4Gi
              ephemeral-storage: 10Gi
          livenessProbe:
            tcpSocket:
              port: 10000
            initialDelaySeconds: 120
            timeoutSeconds: 1
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 10
          readinessProbe:
            tcpSocket:
              port: 10000
            initialDelaySeconds: 120
            timeoutSeconds: 1
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 10
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - mountPath: /opt/hadoop/etc/hadoop/mapred-site.xml
              name: hive-config
              subPath: mapred-site.xml
            - mountPath: /opt/hadoop/etc/hadoop/yarn-site.xml
              name: hive-config
              subPath: yarn-site.xml

Ports:

  • 10000: HiveServer2 (Thrift) service port
  • 9000: HDFS service port
  • 8088: YARN ResourceManager port
  • 8042: NodeManager port

Environment variables:

  • USER: Hive username
  • PASS: Hive password
  • MYSQL_URI: MySQL connection address for the metastore
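
After applying the manifest, check readiness and try the Thrift port directly before the Service exists; hive-0 follows the StatefulSet naming convention, and the manifest file name is a placeholder:

kubectl apply -f hive-statefulset.yaml
kubectl -n stable-db get pods -l app=hive
# Tunnel HiveServer2 locally and attempt a connection
kubectl -n stable-db port-forward hive-0 10000:10000 &
beeline -u jdbc:hive2://localhost:10000 -n <username>
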
  3. Create the Service
apiVersion: v1
kind: Service
metadata:
  name: hive
  namespace: stable-db
  annotations:
    cloud.google.com/l4-rbs: "enabled"
  labels:
    app: hive
spec:
  loadBalancerIP: <your-ip>
  type: LoadBalancer
  externalTrafficPolicy: Cluster
  ports:
    - port: <your-port>
      nodePort: <your-port>
      targetPort: 10000
      protocol: TCP
      name: hive
    - port: <your-port>
      nodePort: <your-port>
      targetPort: 9870
      protocol: TCP
      name: hive-web
    - port: <your-port>
      nodePort: <your-port>
      targetPort: 9000
      protocol: TCP
      name: hive-metastore
    - port: <your-port>
      nodePort: <your-port>
      targetPort: 8088
      protocol: TCP
      name: hive-resource-manager
    - port: <your-port>
      nodePort: <your-port>
      targetPort: 8042
      protocol: TCP
      name: hive-node-manager
  selector:
    app: hive
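
Once the load balancer is provisioned, clients can reach HiveServer2 through the external address; substitute the IP and the port you mapped to targetPort 10000:

kubectl -n stable-db get svc hive
beeline -u jdbc:hive2://<your-ip>:<your-port> -n <username> -p <password>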

docker-compose Deployment

version: '3'
services:
  hive:
    image: [your image address]
    container_name: hive
    ports:
      - "10000:10000"
      - "9000:9000"
      - "8088:8088"
      - "8042:8042"
    deploy:
      replicas: 1
      resources:
        limits:
          cpus: '2'
          memory: 4G
        reservations:
          cpus: '2'
          memory: 4G
    environment:
      - USER=<username>
      - PASS=<password>
      - MYSQL_URI=jdbc:<MYSQL_URI>?relaxAutoCommit=true
      - JAVA_OPTS=-Xms1024m -Xmx4096m
    volumes:
      - ./conf/hadoop/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./conf/hadoop/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - /data/hive/warehouse:/hive/warehouse
      - /data/hadoop/data:/opt/hadoop/data
      - /data/hadoop/name:/opt/hadoop/name
      - /log/hadoop/logs:/opt/hadoop/logs
      - /log/hive/logs:/opt/hive/logs
    restart: always
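
Bring the stack up and follow the logs until HiveServer2 starts listening on 10000:

docker compose up -d
docker compose logs -f hive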

Possible Issues

  1. FAILED: HiveException java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient

Solution:

schematool -dbType mysql -initSchema
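
This error usually means the metastore schema was never initialized in MySQL. After running the init, you can verify the schema version (run inside the container, with hive-site.xml already pointing at your MySQL instance):

# Prints the metastore schema version recorded in MySQL
schematool -dbType mysql -info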
