The steps are as follows:
1) Check the token required by kubeadm join
[root@d3-master-001 kubernetes]# kubeadm token create --ttl=0 --print-join-command
kubeadm join 10.24.10.114:8443 --token nbtrlg.0ualrssglhr9tuq5 --discovery-token-ca-cert-hash sha256:e5c136324bb642c2d93251f8db5b1ef7dc7fe44aed88ddec224d5bc6b1042b73
[root@d3-master-001 kubernetes]# kubeadm token list
TOKEN TTL EXPIRES USAGES DESCRIPTION EXTRA GROUPS
bakjyr.2n74ylbxpg81gfqr 23h 2019-10-12T10:36:57+08:00 authentication,signing <none> system:bootstrappers:kubeadm:default-node-token
nbtrlg.0ualrssglhr9tuq5 <forever> <never> authentication,signing <none> system:bootstrappers:kubeadm:default-node-token
pobzkr.w8ijb20i1arzfve0 1h 2019-10-11T12:36:57+08:00 <none> Proxy for managing TTL for the kubeadm-certs secret <none> system:bootstrappers:kubeadm:default-node-token
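If the --discovery-token-ca-cert-hash value is needed again later, it can be recomputed from the cluster CA instead of rerunning kubeadm; the standard openssl pipeline, run on a master node:
[root@d3-master-001 kubernetes]# openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
e5c136324bb642c2d93251f8db5b1ef7dc7fe44aed88ddec224d5bc6b1042b73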
2) Initial system configuration of the worker nodes
Taking d3-nginx-001 ~ 003 as examples. Set the hostnames:
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node hostnamectl set-hostname d3-nginx-00$(expr $(echo $node|cut -d. -f4) - 78);done
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node hostname;done
d3-nginx-001
d3-nginx-002
d3-nginx-003
Adjust the disk mounts: the root partition (200G) holds system data, and the /data partition (900.3G) holds Docker and other data:
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node sed -i 's#/data/resources#/data#' /etc/fstab;done
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node umount /data/resources;done
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node mount -a;done
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node df -hT;done
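To spot-check just the new /data mount (findmnt exits non-zero if the mountpoint is missing), a quick sketch:
[root@d3-master-001 kubernetes]# for node in 10.24.10.{79..81};do ssh -p 5837 root@$node findmnt /data;done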
3) Prepare the Nvidia GPU driver
[root@d3-master-001 kubernetes]# mkdir nvidia
// Yum repository with archived kernel-devel/kernel-headers packages for CentOS 7.4; the NVIDIA installer builds a kernel module, so these must match the running kernel
[root@d3-master-001 kubernetes]# cat > nvidia/centos7_4.repo <<EOF
[centos7_4]
name=Archived kernel packages for centos7.4
baseurl=http://archive.kernel.org/centos-vault/7.4.1708/updates/x86_64/
enabled=1
gpgcheck=0
EOF
// Config file that blacklists nouveau, which conflicts with the Nvidia driver
[root@d3-master-001 kubernetes]# cat > nvidia/blacklist-nouveau.conf <<EOF
blacklist nouveau
EOF
Upload NVIDIA-Linux-x86_64-381.22.run and 381.22.tar.gz to the nvidia directory:
[root@d3-master-001 kubernetes]# ll nvidia
total 173472
-rw-r--r-- 1 root root 101042876 Jan 2 2018 381.22.tar.gz
-rw-r--r-- 1 root root 18 Oct 12 09:25 blacklist-nouveau.conf
-rw-r--r-- 1 root root 149 Oct 11 22:19 centos7_4.repo
-rw-r--r-- 1 root root 76581130 Oct 8 2018 NVIDIA-Linux-x86_64-381.22.run
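381.22.tar.gz holds the user-space driver files that the playbook later unpacks into /var/lib/nvidia-docker/volumes/nvidia_driver/; listing the archive before uploading confirms its layout:
[root@d3-master-001 kubernetes]# tar -tzf nvidia/381.22.tar.gz | head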
4) Prepare the nvidia-docker2 packages. Upload the nvidia-docker2 packages to the docker directory:
[root@d3-master-001 kubernetes]# ll docker|grep nvidia
-rw-r--r-- 1 root root 78692 Oct 24 20:38 libnvidia-container1-1.0.5-1.x86_64.rpm
-rw-r--r-- 1 root root 36800 Oct 24 20:38 libnvidia-container-tools-1.0.5-1.x86_64.rpm
-rw-r--r-- 1 root root 1663260 Oct 24 20:38 nvidia-container-runtime-2.0.0-1.docker17.03.2.x86_64.rpm
-rw-r--r-- 1 root root 631268 Oct 24 20:38 nvidia-container-toolkit-1.0.5-2.x86_64.rpm
-rw-r--r-- 1 root root 4748 Oct 24 20:38 nvidia-docker2-2.0.3-1.docker17.03.2.ce.noarch.rpm
/etc/docker/daemon.json for Nvidia GPU nodes:
[root@d3-master-001 kubernetes]# cat > docker/docker-daemon-gpu.json <<EOF
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
"exec-opts": ["native.cgroupdriver=cgroupfs"],
"log-driver": "json-file",
"log-opts": {
"max-size": "1024m",
"max-file": "3"
},
"storage-driver": "overlay2",
"storage-opts": [
"overlay2.override_kernel_check=true"
]
}
EOF
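The playbook below first installs docker/docker-daemon-cpu.json on every node and only then overrides it on GPU nodes. The CPU variant is not reproduced in this runbook; a minimal sketch, assuming it is the GPU file minus the nvidia runtime entries:
[root@d3-master-001 kubernetes]# cat > docker/docker-daemon-cpu.json <<EOF
{
  "exec-opts": ["native.cgroupdriver=cgroupfs"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "1024m",
    "max-file": "3"
  },
  "storage-driver": "overlay2",
  "storage-opts": [
    "overlay2.override_kernel_check=true"
  ]
}
EOF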
5) Prepare the Ansible inventory file
[root@d3-master-001 kubernetes]# cat > hosts-all-nodes <<EOF
[new-nodes]
d3-nginx-001 ansible_host=10.24.10.79 ansible_port=5837 ansible_user=root ansible_password=******
d3-nginx-002 ansible_host=10.24.10.80 ansible_port=5837 ansible_user=root ansible_password=******
d3-nginx-003 ansible_host=10.24.10.81 ansible_port=5837 ansible_user=root ansible_password=******
d3-kong-001 ansible_host=10.24.10.82 ansible_port=5837 ansible_user=root ansible_password=******
d3-kong-002 ansible_host=10.24.10.83 ansible_port=5837 ansible_user=root ansible_password=******
d3-gpu-066 ansible_host=10.24.10.94 ansible_port=5837 ansible_user=root ansible_password=******
...
EOF
6) Prepare the Ansible playbook file
[root@d3-master-001 kubernetes]# cat > pb-kubeadm-join-worker.yaml <<EOF
- hosts: new-nodes
remote_user: root
vars:
# kubernetes
harborIP: 10.12.8.248
kubeadm_apiserver: 10.24.10.114:8443
kubeadm_token: nbtrlg.0ualrssglhr9tuq5
kubeadm_cahash: sha256:e5c136324bb642c2d93251f8db5b1ef7dc7fe44aed88ddec224d5bc6b1042b73
tasks:
# prepare system
- name: disable swap
shell: swapoff -a && sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab
- name: disable firewalld
systemd: name=firewalld state=stopped enabled=no
- name: config iptables
shell: iptables -P INPUT ACCEPT && iptables -P FORWARD ACCEPT && iptables -P OUTPUT ACCEPT
- name: disable selinux
selinux: state=disabled
- name: add harbor's address to /etc/hosts
lineinfile:
path: /etc/hosts
regexp: 'docker\.v2\.aispeech\.com'
line: '{{harborIP}} docker.v2.aispeech.com'
# modify dns
    - name: modify bond0's dns servers
      shell: nmcli con mod bond0 ipv4.dns "223.5.5.5 180.76.76.76 119.29.29.29"
    - name: modify bond0's dns options
      shell: nmcli con mod bond0 ipv4.dns-options "rotate,timeout:1,attempts:1"
    - name: reactivate bond0 to apply the dns changes
      shell: nmcli con up bond0
# configure ntp
- name: install chrony
yum: name=chrony.x86_64 state=present
- name: enable chronyd.service
systemd: name=chronyd state=started enabled=yes
- name: configure ntp
shell: timedatectl set-timezone Asia/Shanghai && timedatectl set-ntp yes && timedatectl set-local-rtc 0
# configure yum repository
- name: archive original repository
shell: mkdir -p /etc/yum.repos.d/bak && mv /etc/yum.repos.d/*.repo /etc/yum.repos.d/bak
ignore_errors: yes
- name: copy yum repositories
copy:
src: '{{ item.src }}'
dest: '{{ item.dest }}'
with_items:
- {src: 'docker/CentOS-Base.repo',dest: '/etc/yum.repos.d/CentOS-Base.repo'}
- {src: 'docker/epel.repo',dest: '/etc/yum.repos.d/epel.repo'}
#- {src: 'docker/docker.repo',dest: '/etc/yum.repos.d/docker.repo'}
- {src: 'docker/kubernetes.repo',dest: '/etc/yum.repos.d/kubernetes.repo'}
- name: install some useful packages
yum:
name: nfs-utils,pciutils,psmisc,tcpdump,net-tools,telnet,nmap-ncat,bash-completion,tree
state: present
# install docker-ce
- name: copy packages of the new version of docker-ce
copy:
src: '{{ item.src }}'
dest: '{{ item.dest }}'
with_items:
- { src: 'docker/docker-ce-17.03.2.ce-1.el7.centos.x86_64.rpm', dest: '/tmp/docker-ce-17.03.2.ce-1.el7.centos.x86_64.rpm' }
- { src: 'docker/docker-ce-selinux-17.03.2.ce-1.el7.centos.noarch.rpm', dest: '/tmp/docker-ce-selinux-17.03.2.ce-1.el7.centos.noarch.rpm' }
- name: install new version of docker-ce
yum:
name: /tmp/docker-ce-17.03.2.ce-1.el7.centos.x86_64.rpm,/tmp/docker-ce-selinux-17.03.2.ce-1.el7.centos.noarch.rpm
- name: delete packages of the new version of docker-ce
file:
path: '{{ item.name }}'
state: absent
with_items:
- { name: '/tmp/docker-ce-17.03.2.ce-1.el7.centos.x86_64.rpm' }
- { name: '/tmp/docker-ce-selinux-17.03.2.ce-1.el7.centos.noarch.rpm' }
# install kubernetes
- name: install kubernetes v1.14.6
yum:
name: kubeadm-1.14.6-0.x86_64,kubelet-1.14.6-0.x86_64,kubectl-1.14.6-0.x86_64,kubernetes-cni-0.7.5-0.x86_64,cri-tools-1.13.0-0.x86_64
state: present
- debug: msg="Docker and kubernetes installed."
# configure docker
- debug: msg="Configing docker and kubernetes ..."
- name: mkdir /data/docker/
file: name=/data/docker/ state=directory
- name: mkdir /etc/docker/
file: path=/etc/docker/ state=directory
- name: mkdir /etc/systemd/system/docker.service.d/
file: path=/etc/systemd/system/docker.service.d/ state=directory
- name: copy docker.conf
copy:
src: docker/docker.conf
dest: /etc/systemd/system/docker.service.d/docker.conf
- name: copy daemon.json
copy:
src: docker/docker-daemon-cpu.json
dest: /etc/docker/daemon.json
# configure kubernetes
- name: copy k8s.conf
copy:
src: docker/k8s.conf
dest: /etc/sysctl.d/k8s.conf
- name: modify sysctl.conf
shell: sed -i '/vm.swappiness=1/d' /etc/sysctl.conf
- name: make k8s.conf effective
shell: sysctl --system
# install ipvs required kernel modules and packages for kube-proxy
- name: install ipvs required packages
yum:
name: conntrack-tools,ipvsadm,ipset
state: present
- name: load ipvs required kernel modules
lineinfile:
path: /usr/lib/systemd/system/kubelet.service
regexp: 'ExecStartPre'
insertafter: '\[Service\]'
line: 'ExecStartPre=/usr/sbin/modprobe -a ip_vs ip_vs_rr ip_vs_wrr ip_vs_sh nf_conntrack_ipv4'
- debug: msg="Docker and kubernetes configed ."
# install nvidia-driver and nvidia-docker2 for GPU nodes
- shell: lspci |grep -i nvidia|wc -l
register: number_gpu
- name: install nvidia-driver and nvidia-docker2 for GPU nodes
block:
- debug: msg="I'm a GPU node,installing nvidia-driver and nvidia-docker2 ..."
# install kernel headers and development packages
- name: copy centos7_4's yum repository
copy:
src: nvidia/centos7_4.repo
dest: /etc/yum.repos.d/centos7_4.repo
- name: install kernel headers and development packages
yum:
name: kernel-devel-{{ansible_kernel}},kernel-headers-{{ansible_kernel}}
state: present
- name: remove centos7_4's yum repository
file: path=/etc/yum.repos.d/centos7_4.repo state=absent
# blacklist nouveau
- name: copy blacklist-nouveau.conf
copy:
src: nvidia/blacklist-nouveau.conf
dest: /etc/modprobe.d/blacklist-nouveau.conf
- name: rmmod nouveau
shell: if [[ $(lsmod|grep nouveau) ]];then rmmod nouveau;fi
- name: backup current initramfs
shell: mv /boot/initramfs-{{ansible_kernel}}.img /boot/initramfs-$(uname -r).img.bak
- name: build new initramfs
shell: dracut /boot/initramfs-{{ansible_kernel}}.img {{ansible_kernel}}
- name: test if dracut succeeded
shell: ls /boot/initramfs-{{ansible_kernel}}.img
- name: reboot to disable nouveau
reboot:
reboot_timeout: 600
msg: "Reboot initiated by Ansible to disable nouveau"
#- name: Wait for the server to finish rebooting
# wait_for_connection: delay=30 sleep=5 timeout=600 connect_timeout=30
- name: gather facts for second time
setup:
# install nvidia-driver
- name: copy nvidia-driver executable file
copy:
src: nvidia/NVIDIA-Linux-x86_64-381.22.run
dest: /tmp/NVIDIA-Linux-x86_64-381.22.run
mode: a+x
- stat: path=/usr/bin/nvidia-smi
register: nvidia_driver_stat
- name: install nvidia-driver
shell: /tmp/NVIDIA-Linux-x86_64-381.22.run --silent
          when: not nvidia_driver_stat.stat.exists
- shell: nvidia-smi
register: nvidia_smi_output
- debug: var=nvidia_smi_output.stdout_lines
- file: path=/var/lib/nvidia-docker/volumes/nvidia_driver/ state=directory
- name: unarchive 381.22.tar.gz to cuda-aitf:20190221's hostPath
unarchive:
src: nvidia/381.22.tar.gz
dest: /var/lib/nvidia-docker/volumes/nvidia_driver/
# install nvidia-docker2
- name: copy packages of the new version of nvidia-docker2
copy:
src: '{{ item.src }}'
dest: '{{ item.dest }}'
with_items:
- { src: 'docker/libnvidia-container1-1.0.5-1.x86_64.rpm', dest: '/tmp/libnvidia-container1-1.0.5-1.x86_64.rpm' }
- { src: 'docker/libnvidia-container-tools-1.0.5-1.x86_64.rpm', dest: '/tmp/libnvidia-container-tools-1.0.5-1.x86_64.rpm' }
- { src: 'docker/nvidia-container-runtime-2.0.0-1.docker17.03.2.x86_64.rpm', dest: '/tmp/nvidia-container-runtime-2.0.0-1.docker17.03.2.x86_64.rpm' }
- { src: 'docker/nvidia-container-toolkit-1.0.5-2.x86_64.rpm', dest: '/tmp/nvidia-container-toolkit-1.0.5-2.x86_64.rpm' }
- { src: 'docker/nvidia-docker2-2.0.3-1.docker17.03.2.ce.noarch.rpm', dest: '/tmp/nvidia-docker2-2.0.3-1.docker17.03.2.ce.noarch.rpm' }
- name: install new version of nvidia-docker2
yum:
name: /tmp/libnvidia-container1-1.0.5-1.x86_64.rpm,/tmp/libnvidia-container-tools-1.0.5-1.x86_64.rpm,/tmp/nvidia-container-toolkit-1.0.5-2.x86_64.rpm,/tmp/nvidia-docker2-2.0.3-1.docker17.03.2.ce.noarch.rpm,/tmp/nvidia-container-runtime-2.0.0-1.docker17.03.2.x86_64.rpm
- name: delete packages of the new version of nvidia-docker2
file:
path: '{{ item.name }}'
state: absent
with_items:
- { name: '/tmp/libnvidia-container1-1.0.5-1.x86_64.rpm' }
- { name: '/tmp/libnvidia-container-tools-1.0.5-1.x86_64.rpm' }
- { name: '/tmp/nvidia-container-runtime-2.0.0-1.docker17.03.2.x86_64.rpm' }
- { name: '/tmp/nvidia-container-toolkit-1.0.5-2.x86_64.rpm' }
- { name: '/tmp/nvidia-docker2-2.0.3-1.docker17.03.2.ce.noarch.rpm' }
# set nvidia-docker as the default runtime
- name: copy daemon.json
copy:
src: docker/docker-daemon-gpu.json
dest: /etc/docker/daemon.json
- debug: msg="nvidia-driver and nvidia-docker2 installed ."
when: number_gpu.stdout|int >= 1
- debug: msg="I'm NOT a GPU node,skipped installing nvidia-driver and nvidia-docker2 ..."
when: number_gpu.stdout|int == 0
# start docker.service and enable kubelet.service
- debug: msg="Restarting docker.service and enable kubelet.service ..."
- name: start docker.service
systemd: name=docker state=restarted enabled=yes daemon_reload=yes
- name: enable kubelet.service
systemd: name=kubelet enabled=yes daemon_reload=yes
- debug: msg="docker.service started and kubelet.service enabled ."
# kubeadm join
- debug: msg="kubeadm join ..."
- name: kubeadm join
shell: /usr/bin/kubeadm join {{kubeadm_apiserver}} --token {{kubeadm_token}} --discovery-token-ca-cert-hash {{kubeadm_cahash}}
# kubectl label GPU node
- shell: nvidia-smi
register: nvidia_smi_output
ignore_errors: yes
- name: kubectl label GPU nodes
shell: /usr/bin/kubectl label nodes --kubeconfig /etc/kubernetes/kubelet.conf {{ ansible_hostname }} cuda-aitf=1
when: nvidia_smi_output.rc == 0
ignore_errors: yes
## config opsmind
#- debug: msg="Configing opsmind ..."
#- name: copy delete coredump script
# copy: src=/opt/script/delete_coredump.sh dest=/opt/ mode=a+x
#- name: copy oss_upload
# copy: src=/opt/script/oss_upload dest=/usr/bin/ mode=a+x
#- name: add crontab
# cron: name="delete coredump" minute="*/1" job="bash /opt/delete_coredump.sh"
#- name: copy reinstall_dogagent script
# copy: src=/opt/script/reinstall_dogagent.sh dest=/opt/ mode=a+x
    #- name: reinstall dogagent
# command: /bin/bash /opt/reinstall_dogagent.sh
# ignore_errors: yes
#- debug: msg="Opsmind configed ."
EOF
7) Initialize and add the worker nodes. The yum-installed Ansible 2.4 cannot reconnect to a machine after rebooting it (no reboot module), so this step uses the latest Ansible:
[root@d3-master-001 kubernetes]# ansible --version
ansible 2.4.2.0
config file = /etc/ansible/ansible.cfg
configured module search path = [u'/root/.ansible/plugins/modules', u'/usr/share/ansible/plugins/modules']
ansible python module location = /usr/lib/python2.7/site-packages/ansible
executable location = /usr/bin/ansible
python version = 2.7.5 (default, Aug 4 2017, 00:39:18) [GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]
[root@d3-master-001 kubernetes]# yum install git -y
[root@d3-master-001 kubernetes]# git clone git://github.com/ansible/ansible.git --recursive /usr/local/bin/ansible
[root@d3-master-001 kubernetes]# source /usr/local/bin/ansible/hacking/env-setup
[root@d3-master-001 kubernetes]# ansible --version
ansible 2.10.0.dev0
config file = /etc/ansible/ansible.cfg
configured module search path = [u'/root/.ansible/plugins/modules', u'/usr/share/ansible/plugins/modules']
ansible python module location = /usr/local/bin/ansible/lib/ansible
executable location = /usr/local/bin/ansible/bin/ansible
python version = 2.7.5 (default, Aug 4 2017, 00:39:18) [GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]
Initialize and add the worker nodes:
[root@d3-master-001 kubernetes]# source /usr/local/bin/ansible/hacking/env-setup
[root@d3-master-001 kubernetes]# ansible -i hosts-all-nodes new-nodes -m ping
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-kubeadm-join-worker.yaml --syntax-check
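Optionally canary the playbook on a single node before touching every host in new-nodes; --limit restricts the run without editing the inventory:
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-kubeadm-join-worker.yaml --limit d3-nginx-001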
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-kubeadm-join-worker.yaml
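Once the play finishes, the new nodes should register and reach Ready within a minute or two; verify from the master:
[root@d3-master-001 kubernetes]# kubectl get nodes -o wide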
Because the servers must be rebooted after the Nvidia GPU driver is installed, and the old Ansible on the Jumpserver does not support the reboot module, the newer Ansible on d3-master-001 is used to add nodes.
1. For a reused machine, log in to OpsMind and delete any existing server with the same name (OpsMind - Infrastructure - Servers).
2. Log in to d3-master-001 and load the newer Ansible:
[root@d3-master-001 ~]# source /usr/local/bin/ansible/hacking/env-setup
3. Edit the Ansible inventory: add the nodes to be joined to the new-nodes group of hosts-all-nodes (removing machines already listed in that group), in the following format:
[root@d3-master-001 ~]# cd ~/kubernetes && vi hosts-all-nodes
...
[new-nodes]
d3-gpu-073 ansible_host=10.24.10.101 ansible_port=5837 ansible_user=root ansible_ssh_private_key_file=~/.ssh/d3-guoke.pem
d3-gpu-074 ansible_host=10.24.10.102 ansible_port=5837 ansible_user=root ansible_ssh_private_key_file=~/.ssh/d3-guoke.pem
d3-gpu-077 ansible_host=10.24.10.105 ansible_port=5837 ansible_user=root ansible_ssh_private_key_file=~/.ssh/d3-guoke.pem
4. Run the playbook that adds the nodes:
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-kubeadm-join-worker.yaml
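Afterwards, confirm the GPU nodes joined and were labeled, and that the driver answers; a sketch, assuming the key file from the inventory:
[root@d3-master-001 kubernetes]# kubectl get nodes -l cuda-aitf=1
[root@d3-master-001 kubernetes]# ssh -p 5837 -i ~/.ssh/d3-guoke.pem root@10.24.10.101 nvidia-smi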