Jupyter Lab provides an interactive computing environment. In this article we build a single-node cluster with HDFS/Hive/Spark installed via Ambari.
We then connect from Jupyter to Spark using pyspark.
〇Jupyter Lab screen
〇How to build the environment
The following Vagrantfile builds a virtual machine (Ubuntu 16.04) with Jupyter Lab and the Ambari-managed HDFS/Hive/Spark stack installed.
Vagrantfile
VAGRANTFILE_API_VERSION = "2"
Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
config.vm.box = "bento/ubuntu-16.04"
config.vm.hostname = "ub1604jupyterspark.vm.internal"
config.vm.provider :virtualbox do |vbox|
vbox.name = "ub1604jupyterspark.vm.internal"
vbox.cpus = 4
vbox.memory = 10240
vbox.customize ["modifyvm", :id, "--nicpromisc2","allow-all"]
end
# private network
config.vm.network "private_network", ip: "192.168.55.117", :netmask => "255.255.255.0"
# bridge network
config.vm.network "public_network", ip: "192.168.1.117", :netmask => "255.255.255.0"
config.vm.provision "shell", inline: <<-SHELL
echo "192.168.55.117 ub1604jupyterspark" >> /etc/hosts
apt-get -y install curl
cd /root
mkdir ./.ssh
ssh-keygen -f ./.ssh/id_rsa -t rsa -N ''
# copy private key
cp -f ./.ssh/id_rsa /vagrant
cat ./.ssh/id_rsa.pub >> ./.ssh/authorized_keys
chmod 600 ./.ssh/authorized_keys
# install and configure ambari server
wget -O /etc/apt/sources.list.d/ambari.list http://public-repo-1.hortonworks.com/ambari/ubuntu16/2.x/updates/2.6.2.0/ambari.list
apt-key adv --recv-keys --keyserver keyserver.ubuntu.com B9733A7A07513CAD
apt-get update
# install postgresql
apt-get -y install postgresql
echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
sed -i 's/host.*all.*all.*127.0.0.1/#host all all 127.0.0.1/g' /etc/postgresql/9.5/main/pg_hba.conf
echo "host all all 127.0.0.1/32 password" >> /etc/postgresql/9.5/main/pg_hba.conf
echo "host all all 192.168.1.0/24 password" >> /etc/postgresql/9.5/main/pg_hba.conf
echo "host all all 192.168.55.0/24 password" >> /etc/postgresql/9.5/main/pg_hba.conf
echo "host all hive 127.0.0.1/32 password" >> /etc/postgresql/9.5/main/pg_hba.conf
# create hive database and hive user
su - postgres << EOF
createdb -T template0 --encoding=UTF8 ambari
createdb -T template0 --encoding=UTF8 hive
psql -c "
alter user postgres with password 'postgres';
create user ambari with password 'ambari';
grant all privileges on database ambari to ambari;
create user hive with password 'hive';
grant all privileges on database hive to hive;
"
EOF
echo "postgres:postgres" | chpasswd
systemctl restart postgresql.service
# install jdbc driver for postgresql
wget https://jdbc.postgresql.org/download/postgresql-42.2.2.jar
mkdir -p /opt/jdbc
cp postgresql-42.2.2.jar /opt/jdbc/postgresql-jdbc.jar
chmod 644 /opt/jdbc/postgresql-jdbc.jar
# install ambari
apt-get -y install ambari-server ambari-agent ambari-metrics-assembly
ambari-server setup -s --database=postgres --databasehost=localhost --databaseport=5432 --databasename=ambari --databaseusername=ambari --databasepassword=ambari --jdbc-db=postgres --jdbc-driver=/opt/jdbc/postgresql-jdbc.jar
ambari-server setup --silent
ambari-server start
ambari-agent start
cat << EOF > /home/vagrant/cluster_configuration.json
{
"configurations" : [
{
"hive-site": {
"hive.support.concurrency": "true",
"hive.txn.manager": "org.apache.hadoop.hive.ql.lockmgr.DbTxnManager",
"hive.compactor.initiator.on": "true",
"hive.compactor.worker.threads": "5",
"javax.jdo.option.ConnectionDriverName": "org.postgresql.Driver",
"javax.jdo.option.ConnectionPassword": "hive",
"javax.jdo.option.ConnectionURL": "jdbc:postgresql://localhost/hive",
"javax.jdo.option.ConnectionUserName": "hive"
}
},
{
"hive-env": {
"hive_ambari_database": "PostgreSQL",
"hive_database": "Existing PostgreSQL Database",
"hive_database_type": "postgres",
"hive_database_name": "hive"
}
},
{
"core-site": {
"properties" : {
"hadoop.proxyuser.root.groups" : "*",
"hadoop.proxyuser.root.hosts" : "*"
}
}
}
],
"host_groups" : [
{
"name" : "host_group_1",
"components" : [
{
"name" : "NAMENODE"
},
{
"name" : "SECONDARY_NAMENODE"
},
{
"name" : "DATANODE"
},
{
"name" : "HDFS_CLIENT"
},
{
"name" : "RESOURCEMANAGER"
},
{
"name" : "NODEMANAGER"
},
{
"name" : "YARN_CLIENT"
},
{
"name" : "HISTORYSERVER"
},
{
"name" : "APP_TIMELINE_SERVER"
},
{
"name" : "ZOOKEEPER_SERVER"
},
{
"name" : "ZOOKEEPER_CLIENT"
},
{
"name" : "METRICS_MONITOR"
},
{
"name" : "TEZ_CLIENT"
},
{
"name" : "HIVE_SERVER"
},
{
"name" : "HIVE_METASTORE"
},
{
"name" : "METRICS_COLLECTOR"
},
{
"name" : "WEBHCAT_SERVER"
},
{
"name" : "PIG"
},
{
"name" : "SLIDER"
},
{
"name" : "SPARK2_THRIFTSERVER"
},
{
"name" : "SPARK2_CLIENT"
},
{
"name" : "SPARK2_JOBHISTORYSERVER"
}
],
"cardinality" : "1"
}
],
"settings" : [{
"recovery_settings" : [{
"recovery_enabled" : "true"
}]
}],
"Blueprints" : {
"blueprint_name" : "hdp26-jupyter-spark",
"stack_name" : "HDP",
"stack_version" : "2.6"
}
}
EOF
curl -H "X-Requested-By: ambari" -X POST -u admin:admin http://localhost:8080/api/v1/blueprints/hdp26-jupyter-spark -d @/home/vagrant/cluster_configuration.json
cat << EOF > /home/vagrant/hostmapping.json
{
"blueprint" : "hdp26-jupyter-spark",
"default_password" : "admin",
"provision_action" : "INSTALL_AND_START",
"host_groups" :[
{
"name" : "host_group_1",
"hosts" : [
{
"fqdn" : "ub1604jupyterspark.vm.internal"
}
]
}
]
}
EOF
curl -H "X-Requested-By: ambari" -X POST -u admin:admin http://localhost:8080/api/v1/clusters/hdp26-jupyter-spark -d @/home/vagrant/hostmapping.json
sleep 30
# wait until the cluster is ready.
ProgressPercent=`curl -s --user admin:admin -X GET http://localhost:8080/api/v1/clusters/hdp26-jupyter-spark/requests/1 | grep progress_percent | awk '{print $3}' | cut -d . -f 1`
while [[ `echo $ProgressPercent | grep -v 100` ]]; do
ProgressPercent=`curl -s --user admin:admin -X GET http://localhost:8080/api/v1/clusters/hdp26-jupyter-spark/requests/1 | grep progress_percent | awk '{print $3}' | cut -d . -f 1`
echo " Progress: $ProgressPercent %"
sleep 10
done
cat << EOF > /home/vagrant/shutdown_components.sh
#!/bin/bash
#stop all services
curl -u admin:admin -i -H 'X-Requested-By: ambari' -X PUT \
-d '{"RequestInfo":{"context":"_PARSE_.STOP.ALL_SERVICES","operation_level":{"level":"CLUSTER","cluster_name":"hdp26-minnimal-hive"}},"Body":{"ServiceInfo":{"state":"INSTALLED"}}}' \
http://localhost:8080/api/v1/clusters/hdp26-jupyter-spark/services
EOF
chmod +x /home/vagrant/shutdown_components.sh
cat << EOF > /tmp/test.csv
100,15000000
200,20000000
300,18000000
EOF
cat << EOF > /tmp/sample.sql
CREATE EXTERNAL TABLE sample (
store_id INT,
sales INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = '"'
)
stored as textfile
LOCATION '/user/hive';
LOAD DATA LOCAL INPATH '/tmp/test.csv' OVERWRITE INTO TABLE sample;
select * from sample;
EOF
# upload sample content
sudo -u hive hdfs dfs -put /tmp/sample.sql /user/hive
sudo -u hive hdfs dfs -ls /user/hive
# create table and select
beeline -u 'jdbc:hive2://localhost:10016/' -n hive -f /tmp/sample.sql
# install anaconda & jupyterlab
wget https://repo.continuum.io/archive/Anaconda3-5.1.0-Linux-x86_64.sh
chmod +x Anaconda3-5.1.0-Linux-x86_64.sh
./Anaconda3-5.1.0-Linux-x86_64.sh -b -p /opt/anaconda
source /opt/anaconda/bin/activate
pip install --upgrade pip
pip install jupyterlab
# install pyspark
pip install findspark pyspark
useradd py
mkdir -p /home/py
chown -R py:py /home/py
sudo -u py bash -c "mkdir /home/py/.jupyter"
sudo -u py bash -c "cat << EOF > /home/py/.jupyter/jupyter_notebook_config.py
conf = get_config()
conf.NotebookApp.ip = '*'
conf.NotebookApp.open_browser = False
conf.NotebookApp.port = 8888
conf.NotebookApp.token = 'jupyter'
EOF"
cat << EOF > /etc/systemd/system/jupyter.service
[Unit]
Description=Jupyter notebook
[Service]
Type=simple
Environment=SPARK_HOME=/usr/hdp/current/spark2-client
# systemd cannot run shell builtins such as "source"; put anaconda on PATH instead
Environment=PATH=/opt/anaconda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ExecStart=/opt/anaconda/bin/jupyter lab
User=py
Group=py
WorkingDirectory=/home/py
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl enable jupyter
sudo systemctl start jupyter
echo 'Ambari -> http://192.168.1.117:8080'
echo 'user/password -> admin/admin'
echo ''
echo 'jupyter -> http://192.168.55.117:8888/?token=jupyter'
SHELL
end
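To build the environment, save the Vagrantfile above into an empty directory and run vagrant up there; the box download and the Ambari blueprint provisioning take a while. When provisioning finishes, the URLs and credentials echoed at the end of the script (Ambari and Jupyter) can be used to log in.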
〇Code to verify the setup
import os
os.environ["HADOOP_USER_NAME"] = "hive"
import findspark
findspark.init()  # locate Spark via SPARK_HOME set in the jupyter service
from pyspark.sql import SparkSession
# create the SparkSession that the query below runs against
spark = SparkSession.builder.appName("jupyter-spark").getOrCreate()
spark.sql("SELECT * FROM csv.`/user/hive/test.csv`").show()
〇Related information
・For other articles about Ambari, see here.