1. Batch-push SSH public keys
[root@d3-master-001 kubernetes]# cat > ~/.ssh/id_rsa-d3-public-001.pub <<EOF
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQ8ZkEKcpDohWigd0DjasfJw6MfGEUoRojw526uNGUsiSxtP0BoBYmAxPhuqRVCSCGWCSpdAqAmStoXrfxrudRHS7TtFhzHsOTzyg5OnDOWp72AG62nLBlGeXGSsljjF9uc8T3r8iFzoZJtgoZC32UMG1iz68fn5IsD8kelDI6yuoIbJOOSPvAqQEA8prR2tHN/afflr9WHW4Y3tqBFOrDgxAJRMxHBfrJ31/nnFAi1K4hIB9MJPOnuv4s4qkPLEwNPLd9HvnvqGjTWk0L0UZ8i6g7TAyCIXfRleZPDXQKgg3VmTilBQbvF1mHx4wUHgsbMTQApgm7oLPStUqM9ReJ root@d3-public-001
EOF
[root@d3-master-001 kubernetes]# cat pb-modify-sshkey.yaml
- hosts: all
  user: root
  tasks:
  - name: copy ssh key
    authorized_key: user=root key="{{ lookup('file', '~/.ssh/id_rsa-d3-public-001.pub') }}"
    tags:
    - d3-public-001
[root@d3-master-001 kubernetes]# cat hosts-modify-sshkey
10.24.10.51 ansible_host=10.24.10.51 ansible_port=5837 ansible_user=root ansible_password=
...
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-modify-sshkey pb-modify-sshkey.yaml --syntax-check
playbook: pb-modify-sshkey.yaml
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-modify-sshkey pb-modify-sshkey.yaml
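To verify the push, an ad-hoc check can be run against the same inventory (a sketch, not part of the original run; every host should now have the d3-public-001 key at the end of authorized_keys):
ansible -i hosts-modify-sshkey all -m shell -a "tail -1 /root/.ssh/authorized_keys"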
2. Configure passwordless SSH connections
[root@d3-master-001 kubernetes]# vi ~/.ssh/config
Host d3-master-001
  Hostname 10.24.10.74
  IdentityFile ~/.ssh/d3-guoke.pem
  User root
  Port 5837
  IdentitiesOnly yes
Host d3-master-002
  Hostname 10.24.10.75
  IdentityFile ~/.ssh/d3-guoke.pem
  User root
  Port 5837
  IdentitiesOnly yes
Host d3-master-003
  Hostname 10.24.10.76
  IdentityFile ~/.ssh/d3-guoke.pem
  User root
  Port 5837
  IdentitiesOnly yes
...
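With this config in place, each alias carries its host, port, and key, so connecting is as simple as (illustrative):
ssh d3-master-002 uptime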
3. kubeadm reset
The following commands reset all of the cluster's master nodes; afterwards, re-run kubeadm init:
[root@d3-master-001 kubernetes]# ansible k8s-masters -m shell -a "kubeadm reset -f"
[root@d3-master-001 kubernetes]# ansible k8s-masters -m shell -a "iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X"
[root@d3-master-001 kubernetes]# ansible k8s-masters -m shell -a "ipvsadm --clear"
[root@d3-master-001 kubernetes]# ansible k8s-masters -m systemd -a "name=kube-etcd state=stopped"
[root@d3-master-001 kubernetes]# ansible k8s-masters -m file -a "name=/var/lib/kube-etcd/ state=absent"
[root@d3-master-001 kubernetes]# ansible-playbook pb-kube-etcd.yaml
[root@d3-master-001 kubernetes]# ansible-playbook pb-master-certs.yaml
[root@d3-master-001 kubernetes]# ansible k8s-masters -m shell -a "modprobe -a ip_vs ip_vs_rr ip_vs_wrr ip_vs_sh nf_conntrack_ipv4"
[root@d3-master-001 kubernetes]# kubeadm init ...
For nodes whose hostname, IP address, or other network attributes were changed, or that previously joined a different Kubernetes cluster, the leftover flannel files must be cleaned up first; otherwise the flannel.1 network may be broken, leaving Pod IPs unreachable:
kubeadm reset -f
iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
ipvsadm --clear
ifconfig cni0 down && ip link delete cni0
ifconfig flannel.1 down && ip link delete flannel.1
ip link delete kube-ipvs0
rm -rf /var/lib/cni/
rm -rf /run/flannel
rm -rf /etc/cni
kubeadm join ...
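After a node rejoins, it is worth confirming that the flannel interfaces were recreated with fresh state and that Pod IPs answer (a sketch; 10.244.x.y is a placeholder for a real Pod IP on another node):
# flannel.1 and cni0 should exist again, newly created
ip -d link show flannel.1
ip -d link show cni0
ping -c 3 10.244.x.y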
4. Bind NIC names to MAC addresses with udev rules (@丁颖)
Purpose: prevent NIC names from changing after a server reboot, which would break bond0.
1) Script that generates the udev rules
[root@d3-master-001 kubernetes]# cat > gen-nic-udevrules <<"EOF"
> /etc/udev/rules.d/60-net.rules
ifconfig | egrep -v "^lo:|^bond|^flannel.1|^cni|^veth|^docker" | grep mtu|awk -F ":" '{print$1}' | while read i;do echo "ACTION==\"add\", SUBSYSTEM==\"net\", DRIVERS==\"?*\", ATTR{type}==\"1\", ATTR{address}==\"$(ifconfig $i | grep " ether "|awk '{print$2}')\", NAME=\"$i\"" >> /etc/udev/rules.d/60-net.rules;done
EOF
- The script has a bug: the name recorded for one of bond0's slave NICs may be wrong and must be checked manually. Bonding makes both of bond0's slave NICs report the same MAC address, so each slave's real (permanent) MAC address has to be identified; the corrected commands below read the permanent HW addresses from /proc/net/bonding/bond0:
> /etc/udev/rules.d/60-net.rules
ifconfig | egrep "^eth0|^eth1|^eth2|^eth3|^em1|^em2|^em3|^em4"|grep mtu|awk -F ":" '{print$1}' | while read i;do echo "ACTION==\"add\", SUBSYSTEM==\"net\", DRIVERS==\"?*\", ATTR{type}==\"1\", ATTR{address}==\"$(ifconfig $i | grep " ether "|awk '{print$2}')\", NAME=\"$i\"";done > /etc/udev/rules.d/60-net.rules
cat /proc/net/bonding/bond0|grep "Slave Interface"|awk '{print $3}' | while read i;do echo "ACTION==\"add\", SUBSYSTEM==\"net\", DRIVERS==\"?*\", ATTR{type}==\"1\", ATTR{address}==\"$(cat /proc/net/bonding/bond0|grep -A5 $i|grep "Permanent HW addr"|awk '{print $4}')\", NAME=\"$i\"";done >> /etc/udev/rules.d/60-net.rules
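ethtool provides an independent cross-check of the permanent MACs that the second command reads from /proc/net/bonding/bond0 (a sketch; eth0/eth1 stand in for the actual slave NIC names):
# -P prints the permanent hardware address, which bonding's runtime MAC override does not change
ethtool -P eth0
ethtool -P eth1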
2) Test the udev-rules script
[root@d3-master-001 kubernetes]# chmod a+x ./gen-nic-udevrules && ./gen-nic-udevrules
[root@d3-master-001 kubernetes]# shutdown -r now
3) Apply the udev rules in batch
[root@d3-master-001 kubernetes]# cat > hosts-all-nodes <<EOF
[all-nodes]
d3-master-001 ansible_host=10.24.10.74 ansible_port=5837 ansible_user=root ansible_password=SZaispeech@2016
...
EOF
[root@d3-master-001 kubernetes]# cat > pb-nic-udevrules.yaml <<EOF
- hosts: all-nodes
  user: root
  tasks:
  - name: copy gen-nic-udevrules
    copy: src=gen-nic-udevrules dest=/tmp/gen-nic-udevrules mode=a+x
  - name: gen /etc/udev/rules.d/60-net.rules
    shell: /tmp/gen-nic-udevrules
  - name: reboot to apply nic udev rules
    reboot:
      reboot_timeout: 600
      msg: "Reboot initiated by Ansible to apply nic udev rules"
  - name: test
    shell: hostname
EOF
[root@d3-master-001 kubernetes]# source /usr/local/bin/ansible/hacking/env-setup
[root@d3-master-001 kubernetes]# ansible -i hosts-all-nodes all-nodes -m ping
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-nic-udevrules.yaml
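After the reboots, the generated rules can be spot-checked on all nodes (a sketch, not from the original run):
ansible -i hosts-all-nodes all-nodes -m shell -a "cat /etc/udev/rules.d/60-net.rules"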
5. Deploy the opsmind agent (@邓翔宇)
[root@d3-master-001 kubernetes]# ll script/
total 6100
-rwxr-xr-x 1 root root 201 Feb 20 2019 clear_images.sh
-rwxr-xr-x 1 root root 1691 Sep 1 2018 delete_coredump.sh
-rwxr-xr-x 1 root root 1663 Aug 31 2018 delete_coredump.sh.bak
-rwxr-xr-x 1 root root 249 Sep 12 2018 kill.sh
-rwxr-xr-x 1 root root 6218150 Aug 17 2018 oss_upload
-rw-r--r-- 1 root root 274 May 7 14:01 pynvml.sh
-rwxr-xr-x 1 root root 752 Feb 26 2019 reinstall_dogagent.sh
[root@d3-master-001 kubernetes]# cat > pb-opsmind-agent.yaml <<EOF
- hosts: all-nodes
  user: root
  tasks:
  # config opsmind
  - debug: msg="Configuring opsmind ..."
  #- name: copy delete coredump script
  #  copy: src=script/delete_coredump.sh dest=/opt/ mode=a+x
  - name: copy oss_upload
    copy: src=script/oss_upload dest=/usr/bin/ mode=a+x
  #- name: add crontab
  #  cron: name="delete coredump" minute="*/1" job="bash /opt/delete_coredump.sh"
  - name: copy reinstall_dogagent script
    copy: src=script/reinstall_dogagent.sh dest=/opt/ mode=a+x
  - name: reinstall dogagent
    command: /bin/bash /opt/reinstall_dogagent.sh
    ignore_errors: yes
  - debug: msg="Opsmind configured."
EOF
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-opsmind-agent.yaml
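A quick sanity check that the files distributed by the playbook are in place (a sketch, not from the original run):
ansible -i hosts-all-nodes all-nodes -m shell -a "ls -l /usr/bin/oss_upload /opt/reinstall_dogagent.sh"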
6. Change the nodes' DNS servers (@吴杰)
[root@d3-master-001 kubernetes]# cat > pb-modify-dns.yaml <<EOF
- hosts: all-nodes
  user: root
  tasks:
  - name: modify bond0's dns servers
    shell: nmcli con mod bond0 ipv4.dns "223.5.5.5 180.76.76.76 119.29.29.29"
  - name: modify bond0's dns options
    shell: nmcli con mod bond0 ipv4.dns-options "rotate,timeout:1,attempts:1"
  - name: restart bond0
    shell: nmcli con up bond0
EOF
[root@d3-master-001 kubernetes]# ansible-playbook -i hosts-all-nodes pb-modify-dns.yaml
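To confirm the change took effect on a node (a sketch):
# the profile should list the three servers, and resolv.conf should carry them plus the rotate/timeout/attempts options
nmcli -g ipv4.dns con show bond0
cat /etc/resolv.conf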
7. Add custom DNS records to CoreDNS (@吴杰)
[root@d3-master-001 ~]# kubectl edit cm -n kube-system coredns
apiVersion: v1
data:
  Corefile: |
    .:53 {
        errors
        health
        kubernetes cluster.local in-addr.arpa ip6.arpa {
           pods insecure
           upstream
           fallthrough in-addr.arpa ip6.arpa
        }
        hosts {
           47.110.162.54 d1-kafka-005
           47.111.142.2 d1-kafka-006
           47.111.156.211 d1-kafka-007
           47.110.93.125 d1-kafka-008
           47.110.160.244 d1-kafka-009
           fallthrough
        }
        prometheus :9153
        forward . /etc/resolv.conf
        cache 30
        loop
        reload
        loadbalance
    }
kind: ConfigMap
metadata:
  name: coredns
  namespace: kube-system
- This adds five custom DNS records for the kafka brokers.
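The new records can be verified from inside the cluster (a sketch; assumes a pullable busybox image):
kubectl run dns-test --rm -it --image=busybox --restart=Never -- nslookup d1-kafka-005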
8. Set up a temporary rsyslog server
A temporary rsyslog server is used so that kong and airobot alerts raised by this test cluster cannot be confused with alerts from the production cluster.
1) Edit the /etc/rsyslog.conf configuration file
[root@d3-master-003 ~]# mkdir -p /data/rsyslog/
[root@d3-master-003 ~]# vi /etc/rsyslog.conf
...
# Provides UDP syslog reception
$ModLoad imudp
$UDPServerRun 514
# Provides TCP syslog reception
$ModLoad imtcp
$InputTCPServerRun 514
...
local1.* /data/rsyslog/nginx.log
local4.* /data/rsyslog/kong.log
[root@d3-master-003 ~]# systemctl restart rsyslog
[root@d3-master-003 ~]# netstat -lnptu|grep 514
tcp 0 0 0.0.0.0:514 0.0.0.0:* LISTEN 190128/rsyslogd
tcp6 0 0 :::514 :::* LISTEN 190128/rsyslogd
udp 0 0 0.0.0.0:514 0.0.0.0:* 190128/rsyslogd
udp6 0 0 :::514 :::* 190128/rsyslogd
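Routing into the per-facility files can be confirmed with a test message (a sketch; assumes the util-linux logger with network options, as shipped on CentOS 7):
# facility local4 should land in /data/rsyslog/kong.log per the rules above
logger -n 10.24.10.76 -P 514 -p local4.info "rsyslog routing test"
tail -1 /data/rsyslog/kong.log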
2) Update configmap/cloud-config
[root@d3-master-003 ~]# ip addr|grep "10.24.10"
inet 10.24.10.76/24 brd 10.24.10.255 scope global bond0
Then update the following environment variables in cloud-config:
- SYSLOG_HOST: was 10.24.10.19; change it to 10.24.10.76 (d3-master-003);
- D3_SYSLOG_HOST: was 10.24.10.19; change it to 10.24.10.76 (d3-master-003);
- restart the kong-plugin-gray and casrserver-airobot services.
Note: the temporary rsyslog server was decommissioned on 2019-11-13, and the environment variables in cloud-config have been restored.
9. An attempt at changing the load-balancing algorithm of kube-proxy's IPVS mode (@郭奕超)
The Service's current IPVS entry:
[root@d3-master-003 ~]# kubectl get svc -n odcp
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
error-404 NodePort 10.96.77.126 <none> 80:30645/TCP 4h57m
[root@d3-master-003 ~]# ipvsadm -Ln|grep -A1 "10.96.77.126"
TCP 10.96.77.126:80 rr
-> 10.244.15.70:80 Masq 1 0 0
- Change the load-balancing algorithm to least-connection (lc) on one node:
[root@d3-master-003 ~]# ipvsadm -E -t 10.96.77.126:80 -s lc
[root@d3-master-003 ~]# ipvsadm -Ln|grep -A1 "10.96.77.126"
TCP 10.96.77.126:80 lc
-> 10.244.15.70:80 Masq 1 0 0
- At this point the IPVS entry on another node, d3-master-001, shows no corresponding change:
[root@d3-master-001 ~]# ipvsadm -Ln|grep -A1 "10.96.77.126"
TCP 10.96.77.126:80 rr
-> 10.244.15.70:80 Masq 1 0 0
- After a while, the modified entry automatically reverts to rr, since kube-proxy periodically re-syncs the IPVS rules:
[root@d3-master-001 ~]# ipvsadm -Ln|grep -A1 "10.96.77.126"
TCP 10.96.77.126:80 rr
-> 10.244.15.70:80 Masq 1 0 0
[root@d3-master-003 ~]# ipvsadm -Ln|grep -A1 "10.96.77.126"
TCP 10.96.77.126:80 rr
-> 10.244.15.70:80 Masq 1 0 0
- On the upstream Kubernetes GitHub there is a PR asking kube-proxy to support changing the load-balancing algorithm, but it was rejected:
https://github.com/kubernetes/kubernetes/pull/75556
https://github.com/kubernetes/kubernetes/issues/75502
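In other words, the IPVS scheduler can only be set cluster-wide through kube-proxy's own configuration, not per Service. A sketch, assuming a kubeadm-managed kube-proxy ConfigMap:
kubectl -n kube-system edit cm kube-proxy
# in data."config.conf" (a KubeProxyConfiguration), set:
#   mode: ipvs
#   ipvs:
#     scheduler: "lc"    # default is rr
# then recreate the kube-proxy pods so they pick up the change
kubectl -n kube-system delete pod -l k8s-app=kube-proxy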
10. Deploy node-problem-detector
node-problem-detector is an official Kubernetes project for monitoring node health. With the component deployed, kubectl describe node shows a node's various conditions at a glance, including but not limited to hardware faults, Linux system faults, and the status of the docker and kubelet daemons.
Project GitHub: https://github.com/kubernetes/node-problem-detector
1) Create ConfigMap/node-problem-detector-config
[root@d3-master-001 ~]# kubectl apply -f - <<"EOF"
apiVersion: v1
data:
  kernel-monitor.json: |
    {
      "plugin": "kmsg",
      "logPath": "/dev/kmsg",
      "lookback": "5m",
      "bufferSize": 10,
      "source": "kernel-monitor",
      "metricsReporting": true,
      "conditions": [
        {
          "type": "KernelDeadlock",
          "reason": "KernelHasNoDeadlock",
          "message": "kernel has no deadlock"
        },
        {
          "type": "ReadonlyFilesystem",
          "reason": "FilesystemIsNotReadOnly",
          "message": "Filesystem is not read-only"
        }
      ],
      "rules": [
        {
          "type": "temporary",
          "reason": "OOMKilling",
          "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
        },
        {
          "type": "temporary",
          "reason": "TaskHung",
          "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "temporary",
          "reason": "UnregisterNetDevice",
          "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
        },
        {
          "type": "temporary",
          "reason": "KernelOops",
          "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
        },
        {
          "type": "temporary",
          "reason": "KernelOops",
          "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
        },
        {
          "type": "permanent",
          "condition": "KernelDeadlock",
          "reason": "AUFSUmountHung",
          "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "permanent",
          "condition": "KernelDeadlock",
          "reason": "DockerHung",
          "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "permanent",
          "condition": "ReadonlyFilesystem",
          "reason": "FilesystemIsReadOnly",
          "pattern": "Remounting filesystem read-only"
        }
      ]
    }
  docker-monitor.json: |
    {
      "plugin": "journald",
      "pluginConfig": {
        "source": "dockerd"
      },
      "logPath": "/var/log/journal",
      "lookback": "5m",
      "bufferSize": 10,
      "source": "docker-monitor",
      "metricsReporting": true,
      "conditions": [
        {
          "type": "CorruptDockerOverlay2",
          "reason": "NoCorruptDockerOverlay2",
          "message": "docker overlay2 is functioning properly"
        }
      ],
      "rules": [
        {
          "type": "temporary",
          "reason": "CorruptDockerImage",
          "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
        },
        {
          "type": "permanent",
          "condition": "CorruptDockerOverlay2",
          "reason": "CorruptDockerOverlay2",
          "pattern": "returned error: readlink /var/lib/docker/overlay2.*: invalid argument.*"
        }
      ]
    }
  systemd-monitor.json: |
    {
      "plugin": "journald",
      "pluginConfig": {
        "source": "systemd"
      },
      "logPath": "/var/log/journal",
      "lookback": "5m",
      "bufferSize": 10,
      "source": "systemd-monitor",
      "metricsReporting": true,
      "conditions": [],
      "rules": [
        {
          "type": "temporary",
          "reason": "KubeletStart",
          "pattern": "Started Kubernetes kubelet."
        },
        {
          "type": "temporary",
          "reason": "DockerStart",
          "pattern": "Starting Docker Application Container Engine..."
        },
        {
          "type": "temporary",
          "reason": "ContainerdStart",
          "pattern": "Starting containerd container runtime..."
        }
      ]
    }
  kernel-monitor-counter.json: |
    {
      "plugin": "custom",
      "pluginConfig": {
        "invoke_interval": "5m",
        "timeout": "1m",
        "max_output_length": 80,
        "concurrency": 1
      },
      "source": "kernel-monitor",
      "metricsReporting": true,
      "conditions": [
        {
          "type": "FrequentUnregisterNetDevice",
          "reason": "NoFrequentUnregisterNetDevice",
          "message": "node is functioning properly"
        }
      ],
      "rules": [
        {
          "type": "permanent",
          "condition": "FrequentUnregisterNetDevice",
          "reason": "UnregisterNetDevice",
          "path": "/home/kubernetes/bin/log-counter",
          "args": [
            "--journald-source=kernel",
            "--log-path=/var/log/journal",
            "--lookback=20m",
            "--count=3",
            "--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
          ],
          "timeout": "1m"
        }
      ]
    }
  systemd-monitor-counter.json: |
    {
      "plugin": "custom",
      "pluginConfig": {
        "invoke_interval": "5m",
        "timeout": "1m",
        "max_output_length": 80,
        "concurrency": 1
      },
      "source": "systemd-monitor",
      "metricsReporting": true,
      "conditions": [
        {
          "type": "FrequentKubeletRestart",
          "reason": "NoFrequentKubeletRestart",
          "message": "kubelet is functioning properly"
        },
        {
          "type": "FrequentDockerRestart",
          "reason": "NoFrequentDockerRestart",
          "message": "docker is functioning properly"
        },
        {
          "type": "FrequentContainerdRestart",
          "reason": "NoFrequentContainerdRestart",
          "message": "containerd is functioning properly"
        }
      ],
      "rules": [
        {
          "type": "permanent",
          "condition": "FrequentKubeletRestart",
          "reason": "FrequentKubeletRestart",
          "path": "/home/kubernetes/bin/log-counter",
          "args": [
            "--journald-source=systemd",
            "--log-path=/var/log/journal",
            "--lookback=20m",
            "--delay=5m",
            "--count=5",
            "--pattern=Started Kubernetes kubelet."
          ],
          "timeout": "1m"
        },
        {
          "type": "permanent",
          "condition": "FrequentDockerRestart",
          "reason": "FrequentDockerRestart",
          "path": "/home/kubernetes/bin/log-counter",
          "args": [
            "--journald-source=systemd",
            "--log-path=/var/log/journal",
            "--lookback=20m",
            "--count=5",
            "--pattern=Starting Docker Application Container Engine..."
          ],
          "timeout": "1m"
        },
        {
          "type": "permanent",
          "condition": "FrequentContainerdRestart",
          "reason": "FrequentContainerdRestart",
          "path": "/home/kubernetes/bin/log-counter",
          "args": [
            "--journald-source=systemd",
            "--log-path=/var/log/journal",
            "--lookback=20m",
            "--count=5",
            "--pattern=Starting containerd container runtime..."
          ],
          "timeout": "1m"
        }
      ]
    }
  system-stats-monitor.json: |
    {
      "disk": {
        "metricsConfigs": {
          "disk/io_time": {
            "displayName": "disk/io_time"
          },
          "disk/weighted_io": {
            "displayName": "disk/weighted_io"
          },
          "disk/avg_queue_len": {
            "displayName": "disk/avg_queue_len"
          }
        },
        "includeRootBlk": true,
        "includeAllAttachedBlk": true,
        "lsblkTimeout": "5s"
      },
      "host": {
        "metricsConfigs": {
          "host/uptime": {
            "displayName": "host/uptime"
          }
        }
      },
      "invokeInterval": "60s"
    }
kind: ConfigMap
metadata:
  name: node-problem-detector-config
  namespace: kube-system
EOF
configmap/node-problem-detector-config created
2) Create DaemonSet/node-problem-detector
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-problem-detector-binding
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    k8s-app: node-problem-detector
    version: v0.7.1
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: node-problem-detector
      version: v0.7.1
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        k8s-app: node-problem-detector
        version: v0.7.1
        kubernetes.io/cluster-service: "true"
    spec:
      containers:
      - name: node-problem-detector
        image: docker.v2.aispeech.com/gcr.io/node-problem-detector:v0.7.1
        command:
        - "/bin/sh"
        - "-c"
        - "exec /node-problem-detector --logtostderr --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json,/config/systemd-monitor.json --config.custom-plugin-monitor=/config/kernel-monitor-counter.json,/config/systemd-monitor-counter.json --config.system-stats-monitor=/config/system-stats-monitor.json"
        securityContext:
          privileged: true
        resources:
          limits:
            cpu: "200m"
            memory: "100Mi"
          requests:
            cpu: "20m"
            memory: "20Mi"
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        - name: log
          mountPath: /var/log
          readOnly: true
        - name: kmsg
          mountPath: /dev/kmsg
          readOnly: true
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
        - name: config
          mountPath: /config
          readOnly: true
      volumes:
      - name: log
        hostPath:
          path: /run/log/
      - name: kmsg
        hostPath:
          path: /dev/kmsg
      - name: localtime
        hostPath:
          path: /etc/localtime
      - name: config
        configMap:
          name: node-problem-detector-config
          items:
          - key: kernel-monitor.json
            path: kernel-monitor.json
          - key: docker-monitor.json
            path: docker-monitor.json
          - key: systemd-monitor.json
            path: systemd-monitor.json
          - key: kernel-monitor-counter.json
            path: kernel-monitor-counter.json
          - key: systemd-monitor-counter.json
            path: systemd-monitor-counter.json
          - key: system-stats-monitor.json
            path: system-stats-monitor.json
      serviceAccountName: node-problem-detector
      tolerations:
      - operator: "Exists"
        effect: "NoExecute"
      - key: "CriticalAddonsOnly"
        operator: "Exists"
EOF
serviceaccount/node-problem-detector created
clusterrolebinding.rbac.authorization.k8s.io/node-problem-detector-binding created
daemonset.apps/node-problem-detector created
Prepare the container image (pulled from a public mirror, then tagged and pushed to the private registry referenced by the DaemonSet):
[root@daoker ~]# docker pull registry.cn-hangzhou.aliyuncs.com/istios/node-problem-detector:v0.7.1
[root@daoker ~]# docker tag registry.cn-hangzhou.aliyuncs.com/istios/node-problem-detector:v0.7.1 docker.v2.aispeech.com/gcr.io/node-problem-detector:v0.7.1
[root@daoker ~]# docker push docker.v2.aispeech.com/gcr.io/node-problem-detector:v0.7.1
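With the image in the private registry, a quick check that the DaemonSet pods are running on every node:
kubectl -n kube-system get pods -l k8s-app=node-problem-detector -o wide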
3) Test node-problem-detector
[root@d3-master-001 ~]# kubectl describe nodes d3-master-002
Name: d3-master-002
Roles: <none>
Labels: beta.kubernetes.io/arch=amd64
beta.kubernetes.io/os=linux
kubernetes.io/arch=amd64
kubernetes.io/hostname=d3-master-002
kubernetes.io/os=linux
Annotations: flannel.alpha.coreos.com/backend-data: {"VtepMAC":"6e:26:6c:5c:3b:e3"}
flannel.alpha.coreos.com/backend-type: vxlan
flannel.alpha.coreos.com/kube-subnet-manager: true
flannel.alpha.coreos.com/public-ip: 10.24.10.75
node.alpha.kubernetes.io/ttl: 0
volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp: Thu, 24 Oct 2019 21:40:55 +0800
Taints: <none>
Unschedulable: false
Conditions:
Type Status LastHeartbeatTime LastTransitionTime Reason Message
---- ------ ----------------- ------------------ ------ -------
FrequentContainerdRestart False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentContainerdRestart containerd is functioning properly
KernelDeadlock False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 KernelHasNoDeadlock kernel has no deadlock
ReadonlyFilesystem False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 FilesystemIsNotReadOnly Filesystem is not read-only
CorruptDockerOverlay2 False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoCorruptDockerOverlay2 docker overlay2 is functioning properly
FrequentUnregisterNetDevice False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentUnregisterNetDevice node is functioning properly
FrequentKubeletRestart False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentKubeletRestart kubelet is functioning properly
FrequentDockerRestart False Thu, 31 Oct 2019 20:11:19 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentDockerRestart docker is functioning properly
MemoryPressure False Thu, 31 Oct 2019 20:11:39 +0800 Tue, 29 Oct 2019 10:13:06 +0800 KubeletHasSufficientMemory kubelet has sufficient memory available
DiskPressure False Thu, 31 Oct 2019 20:11:39 +0800 Tue, 29 Oct 2019 10:13:06 +0800 KubeletHasNoDiskPressure kubelet has no disk pressure
PIDPressure False Thu, 31 Oct 2019 20:11:39 +0800 Tue, 29 Oct 2019 10:13:06 +0800 KubeletHasSufficientPID kubelet has sufficient PID available
Ready True Thu, 31 Oct 2019 20:11:39 +0800 Tue, 29 Oct 2019 10:13:16 +0800 KubeletReady kubelet is posting ready status
Addresses:
InternalIP: 10.24.10.75
Hostname: d3-master-002
Capacity:
cpu: 12
ephemeral-storage: 206292968Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 32530512Ki
pods: 110
Allocatable:
cpu: 12
ephemeral-storage: 190119598995
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 32428112Ki
pods: 110
System Info:
Machine ID: c087a889676c404fa4c18a71e4f22dfb
System UUID: 4C4C4544-004B-5810-8053-C7C04F594332
Boot ID: d542deaf-47a3-4c05-91eb-c7da49cad6b8
Kernel Version: 3.10.0-693.el7.x86_64
OS Image: CentOS Linux 7 (Core)
Operating System: linux
Architecture: amd64
Container Runtime Version: docker://17.3.2
Kubelet Version: v1.14.6
Kube-Proxy Version: v1.14.6
PodCIDR: 10.244.1.0/24
Non-terminated Pods: (6 in total)
Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits AGE
--------- ---- ------------ ---------- --------------- ------------- ---
kube-system kube-apiserver-d3-master-002 250m (2%) 0 (0%) 0 (0%) 0 (0%) 2d9h
kube-system kube-controller-manager-d3-master-002 200m (1%) 0 (0%) 0 (0%) 0 (0%) 2d9h
kube-system kube-flannel-ds-amd64-5qj79 100m (0%) 100m (0%) 200Mi (0%) 200Mi (0%) 6d22h
kube-system kube-proxy-5kdjt 0 (0%) 0 (0%) 0 (0%) 0 (0%) 6d22h
kube-system kube-scheduler-d3-master-002 100m (0%) 0 (0%) 0 (0%) 0 (0%) 2d9h
kube-system node-problem-detector-mhw6c 20m (0%) 200m (1%) 20Mi (0%) 100Mi (0%) 14m
Allocated resources:
(Total limits may be over 100 percent, i.e., overcommitted.)
Resource Requests Limits
-------- -------- ------
cpu 670m (5%) 300m (2%)
memory 220Mi (0%) 300Mi (0%)
ephemeral-storage 0 (0%) 0 (0%)
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal NoFrequentUnregisterNetDevice 43m kernel-monitor, d3-master-002 Node condition FrequentUnregisterNetDevice is now: Unknown, reason: NoFrequentUnregisterNetDevice
Normal NoFrequentKubeletRestart 43m systemd-monitor, d3-master-002 Node condition FrequentKubeletRestart is now: Unknown, reason: NoFrequentKubeletRestart
Normal NoFrequentDockerRestart 42m systemd-monitor, d3-master-002 Node condition FrequentDockerRestart is now: Unknown, reason: NoFrequentDockerRestart
Normal NoFrequentContainerdRestart 42m systemd-monitor, d3-master-002 Node condition FrequentContainerdRestart is now: Unknown, reason: NoFrequentContainerdRestart
- Simulate a fault:
[root@d3-master-001 ~]# ssh d3-master-002 "echo 'kernel: INFO: task docker:20744 blocked for more than 120 seconds.' >> /dev/kmsg"
[root@d3-master-001 ~]# kubectl describe nodes d3-master-002
Name: d3-master-002
Roles: <none>
Labels: beta.kubernetes.io/arch=amd64
beta.kubernetes.io/os=linux
kubernetes.io/arch=amd64
kubernetes.io/hostname=d3-master-002
kubernetes.io/os=linux
Annotations: flannel.alpha.coreos.com/backend-data: {"VtepMAC":"6e:26:6c:5c:3b:e3"}
flannel.alpha.coreos.com/backend-type: vxlan
flannel.alpha.coreos.com/kube-subnet-manager: true
flannel.alpha.coreos.com/public-ip: 10.24.10.75
node.alpha.kubernetes.io/ttl: 0
volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp: Thu, 24 Oct 2019 21:40:55 +0800
Taints: <none>
Unschedulable: false
Conditions:
Type Status LastHeartbeatTime LastTransitionTime Reason Message
---- ------ ----------------- ------------------ ------ -------
CorruptDockerOverlay2 False Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoCorruptDockerOverlay2 docker overlay2 is functioning properly
FrequentUnregisterNetDevice False Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentUnregisterNetDevice node is functioning properly
FrequentKubeletRestart False Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentKubeletRestart kubelet is functioning properly
FrequentDockerRestart False Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentDockerRestart docker is functioning properly
FrequentContainerdRestart False Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 19:57:10 +0800 NoFrequentContainerdRestart containerd is functioning properly
KernelDeadlock True Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 20:11:57 +0800 DockerHung kernel: INFO: task docker:20744 blocked for more than 120 seconds.
ReadonlyFilesystem False Thu, 31 Oct 2019 20:13:13 +0800 Thu, 31 Oct 2019 19:57:10 +0800 FilesystemIsNotReadOnly Filesystem is not read-only
MemoryPressure False Thu, 31 Oct 2019 20:12:39 +0800 Tue, 29 Oct 2019 10:13:06 +0800 KubeletHasSufficientMemory kubelet has sufficient memory available
DiskPressure False Thu, 31 Oct 2019 20:12:39 +0800 Tue, 29 Oct 2019 10:13:06 +0800 KubeletHasNoDiskPressure kubelet has no disk pressure
PIDPressure False Thu, 31 Oct 2019 20:12:39 +0800 Tue, 29 Oct 2019 10:13:06 +0800 KubeletHasSufficientPID kubelet has sufficient PID available
Ready True Thu, 31 Oct 2019 20:12:39 +0800 Tue, 29 Oct 2019 10:13:16 +0800 KubeletReady kubelet is posting ready status
Addresses:
InternalIP: 10.24.10.75
Hostname: d3-master-002
Capacity:
cpu: 12
ephemeral-storage: 206292968Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 32530512Ki
pods: 110
Allocatable:
cpu: 12
ephemeral-storage: 190119598995
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 32428112Ki
pods: 110
System Info:
Machine ID: c087a889676c404fa4c18a71e4f22dfb
System UUID: 4C4C4544-004B-5810-8053-C7C04F594332
Boot ID: d542deaf-47a3-4c05-91eb-c7da49cad6b8
Kernel Version: 3.10.0-693.el7.x86_64
OS Image: CentOS Linux 7 (Core)
Operating System: linux
Architecture: amd64
Container Runtime Version: docker://17.3.2
Kubelet Version: v1.14.6
Kube-Proxy Version: v1.14.6
PodCIDR: 10.244.1.0/24
Non-terminated Pods: (6 in total)
Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits AGE
--------- ---- ------------ ---------- --------------- ------------- ---
kube-system kube-apiserver-d3-master-002 250m (2%) 0 (0%) 0 (0%) 0 (0%) 2d9h
kube-system kube-controller-manager-d3-master-002 200m (1%) 0 (0%) 0 (0%) 0 (0%) 2d9h
kube-system kube-flannel-ds-amd64-5qj79 100m (0%) 100m (0%) 200Mi (0%) 200Mi (0%) 6d22h
kube-system kube-proxy-5kdjt 0 (0%) 0 (0%) 0 (0%) 0 (0%) 6d22h
kube-system kube-scheduler-d3-master-002 100m (0%) 0 (0%) 0 (0%) 0 (0%) 2d9h
kube-system node-problem-detector-mhw6c 20m (0%) 200m (1%) 20Mi (0%) 100Mi (0%) 16m
Allocated resources:
(Total limits may be over 100 percent, i.e., overcommitted.)
Resource Requests Limits
-------- -------- ------
cpu 670m (5%) 300m (2%)
memory 220Mi (0%) 300Mi (0%)
ephemeral-storage 0 (0%) 0 (0%)
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal NoFrequentUnregisterNetDevice 44m kernel-monitor, d3-master-002 Node condition FrequentUnregisterNetDevice is now: Unknown, reason: NoFrequentUnregisterNetDevice
Normal NoFrequentKubeletRestart 44m systemd-monitor, d3-master-002 Node condition FrequentKubeletRestart is now: Unknown, reason: NoFrequentKubeletRestart
Normal NoFrequentDockerRestart 44m systemd-monitor, d3-master-002 Node condition FrequentDockerRestart is now: Unknown, reason: NoFrequentDockerRestart
Normal NoFrequentContainerdRestart 44m systemd-monitor, d3-master-002 Node condition FrequentContainerdRestart is now: Unknown, reason: NoFrequentContainerdRestart
Warning TaskHung 3s kernel-monitor, d3-master-002 kernel: INFO: task docker:20744 blocked for more than 120 seconds.
Normal DockerHung 3s kernel-monitor, d3-master-002 Node condition KernelDeadlock is now: True, reason: DockerHung