Deploying Prometheus
Download the kube-prometheus manifests
[root@d3-master-001 kubernetes]# wget https://github.com/coreos/kube-prometheus/archive/v0.2.0.tar.gz
[root@d3-master-001 kubernetes]# tar -zxf v0.2.0.tar.gz
[root@d3-master-001 kubernetes]# cd kube-prometheus-0.2.0/manifests/
Prepare the Docker images used by kube-prometheus
1) List the images in use
[root@d3-master-001 manifests]# grep -H "baseImage: " *.yaml
alertmanager-alertmanager.yaml: baseImage: quay.io/prometheus/alertmanager
prometheus-prometheus.yaml: baseImage: quay.io/prometheus/prometheus
[root@d3-master-001 manifests]# grep -H "version: " alertmanager-alertmanager.yaml prometheus-prometheus.yaml
alertmanager-alertmanager.yaml: version: v0.18.0
prometheus-prometheus.yaml: version: v2.11.0
[root@d3-master-001 manifests]# grep -H "image: " *.yaml
0prometheus-operator-deployment.yaml: image: quay.io/coreos/prometheus-operator:v0.33.0
grafana-deployment.yaml: - image: grafana/grafana:6.2.2
kube-state-metrics-deployment.yaml: image: quay.io/coreos/kube-rbac-proxy:v0.4.1
kube-state-metrics-deployment.yaml: image: quay.io/coreos/kube-rbac-proxy:v0.4.1
kube-state-metrics-deployment.yaml: image: quay.io/coreos/kube-state-metrics:v1.7.2
kube-state-metrics-deployment.yaml: image: k8s.gcr.io/addon-resizer:1.8.4
node-exporter-daemonset.yaml: image: quay.io/prometheus/node-exporter:v0.18.1
node-exporter-daemonset.yaml: image: quay.io/coreos/kube-rbac-proxy:v0.4.1
prometheus-adapter-deployment.yaml: image: quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1
[root@d3-master-001 manifests]# grep -H "quay.io/coreos" *.yaml
0prometheus-operator-deployment.yaml: - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
0prometheus-operator-deployment.yaml: - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.33.0
- Grafana 6.3 and later can serve from a subpath (root_url plus serve_from_sub_path), so the latest 6.4.2 is used here instead of the 6.2.2 referenced in the manifests.
2) Pull the images and push them to the Harbor registry
# docker pull
docker pull quay.io/prometheus/alertmanager:v0.18.0
docker pull quay.io/prometheus/prometheus:v2.11.0
docker pull quay.io/coreos/prometheus-operator:v0.33.0
docker pull grafana/grafana:6.4.2
docker pull quay.io/coreos/kube-rbac-proxy:v0.4.1
docker pull quay.io/coreos/kube-state-metrics:v1.7.2
docker pull registry.cn-hangzhou.aliyuncs.com/istios/addon-resizer:1.8.4
docker pull quay.io/prometheus/node-exporter:v0.18.1
docker pull quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1
docker pull quay.io/coreos/configmap-reload:v0.0.1
docker pull quay.io/coreos/prometheus-config-reloader:v0.33.0
# docker tag
docker tag quay.io/prometheus/alertmanager:v0.18.0 docker.v2.aispeech.com/gcr.io/alertmanager:v0.18.0
docker tag quay.io/prometheus/prometheus:v2.11.0 docker.v2.aispeech.com/gcr.io/prometheus:v2.11.0
docker tag quay.io/coreos/prometheus-operator:v0.33.0 docker.v2.aispeech.com/gcr.io/prometheus-operator:v0.33.0
docker tag grafana/grafana:6.4.2 docker.v2.aispeech.com/gcr.io/grafana:6.4.2
docker tag quay.io/coreos/kube-rbac-proxy:v0.4.1 docker.v2.aispeech.com/gcr.io/kube-rbac-proxy:v0.4.1
docker tag quay.io/coreos/kube-state-metrics:v1.7.2 docker.v2.aispeech.com/gcr.io/kube-state-metrics:v1.7.2
docker tag registry.cn-hangzhou.aliyuncs.com/istios/addon-resizer:1.8.4 docker.v2.aispeech.com/gcr.io/addon-resizer:1.8.4
docker tag quay.io/prometheus/node-exporter:v0.18.1 docker.v2.aispeech.com/gcr.io/node-exporter:v0.18.1
docker tag quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1 docker.v2.aispeech.com/gcr.io/k8s-prometheus-adapter-amd64:v0.4.1
docker tag quay.io/coreos/configmap-reload:v0.0.1 docker.v2.aispeech.com/gcr.io/configmap-reload:v0.0.1
docker tag quay.io/coreos/prometheus-config-reloader:v0.33.0 docker.v2.aispeech.com/gcr.io/prometheus-config-reloader:v0.33.0
# docker push
docker push docker.v2.aispeech.com/gcr.io/alertmanager:v0.18.0
docker push docker.v2.aispeech.com/gcr.io/prometheus:v2.11.0
docker push docker.v2.aispeech.com/gcr.io/prometheus-operator:v0.33.0
docker push docker.v2.aispeech.com/gcr.io/grafana:6.4.2
docker push docker.v2.aispeech.com/gcr.io/kube-rbac-proxy:v0.4.1
docker push docker.v2.aispeech.com/gcr.io/kube-state-metrics:v1.7.2
docker push docker.v2.aispeech.com/gcr.io/addon-resizer:1.8.4
docker push docker.v2.aispeech.com/gcr.io/node-exporter:v0.18.1
docker push docker.v2.aispeech.com/gcr.io/k8s-prometheus-adapter-amd64:v0.4.1
docker push docker.v2.aispeech.com/gcr.io/configmap-reload:v0.0.1
docker push docker.v2.aispeech.com/gcr.io/prometheus-config-reloader:v0.33.0
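The pull/tag/push steps above can also be scripted. A minimal sketch, assuming docker is already logged in to docker.v2.aispeech.com and using the same image list:
for img in \
  quay.io/prometheus/alertmanager:v0.18.0 \
  quay.io/prometheus/prometheus:v2.11.0 \
  quay.io/coreos/prometheus-operator:v0.33.0 \
  grafana/grafana:6.4.2 \
  quay.io/coreos/kube-rbac-proxy:v0.4.1 \
  quay.io/coreos/kube-state-metrics:v1.7.2 \
  registry.cn-hangzhou.aliyuncs.com/istios/addon-resizer:1.8.4 \
  quay.io/prometheus/node-exporter:v0.18.1 \
  quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1 \
  quay.io/coreos/configmap-reload:v0.0.1 \
  quay.io/coreos/prometheus-config-reloader:v0.33.0; do
  docker pull "$img"
  # keep only the trailing name:tag and re-prefix with the Harbor project
  target="docker.v2.aispeech.com/gcr.io/${img##*/}"
  docker tag "$img" "$target"
  docker push "$target"
done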
3) Update the image references in the YAML files
[root@d3-master-001 manifests]# sed -i "s#quay.io/prometheus/alertmanager#docker.v2.aispeech.com/gcr.io/alertmanager#g" alertmanager-alertmanager.yaml
[root@d3-master-001 manifests]# sed -i "s#quay.io/prometheus/prometheus#docker.v2.aispeech.com/gcr.io/prometheus#g" prometheus-prometheus.yaml
[root@d3-master-001 manifests]# for yamlfile in $(ls *-{deployment,daemonset}.yaml);do
sed -i '{
s#quay.io/prometheus/alertmanager:v0.18.0#docker.v2.aispeech.com/gcr.io/alertmanager:v0.18.0#g
s#quay.io/prometheus/prometheus:v2.11.0#docker.v2.aispeech.com/gcr.io/prometheus:v2.11.0#g
s#quay.io/coreos/prometheus-operator:v0.33.0#docker.v2.aispeech.com/gcr.io/prometheus-operator:v0.33.0#g
s#grafana/grafana:6.4.2#docker.v2.aispeech.com/gcr.io/grafana:6.4.2#g
s#quay.io/coreos/kube-rbac-proxy:v0.4.1#docker.v2.aispeech.com/gcr.io/kube-rbac-proxy:v0.4.1#g
s#quay.io/coreos/kube-state-metrics:v1.7.2#docker.v2.aispeech.com/gcr.io/kube-state-metrics:v1.7.2#g
s#k8s.gcr.io/addon-resizer:1.8.4#docker.v2.aispeech.com/gcr.io/addon-resizer:1.8.4#g
s#quay.io/prometheus/node-exporter:v0.18.1#docker.v2.aispeech.com/gcr.io/node-exporter:v0.18.1#g
s#quay.io/coreos/k8s-prometheus-adapter-amd64:v0.4.1#docker.v2.aispeech.com/gcr.io/k8s-prometheus-adapter-amd64:v0.4.1#g
s#quay.io/coreos/configmap-reload:v0.0.1#docker.v2.aispeech.com/gcr.io/configmap-reload:v0.0.1#g
s#quay.io/coreos/prometheus-config-reloader:v0.33.0#docker.v2.aispeech.com/gcr.io/prometheus-config-reloader:v0.33.0#g
}' $yamlfile
done
Deploy kube-prometheus
1) Apply kube-prometheus-0.2.0/manifests/ to deploy:
- The Prometheus Operator
- Highly available Prometheus
- Highly available Alertmanager
- Prometheus node-exporter
- Prometheus Adapter for Kubernetes Metrics APIs
- kube-state-metrics
- Grafana
[root@d3-master-001 manifests]# kubectl apply -f .
...
unable to recognize "prometheus-serviceMonitorKubeScheduler.yaml": no matches for kind "ServiceMonitor" in version "monitoring.coreos.com/v1"
unable to recognize "prometheus-serviceMonitorKubelet.yaml": no matches for kind "ServiceMonitor" in version "monitoring.coreos.com/v1"
[root@d3-master-001 manifests]# kubectl apply -f .
...
servicemonitor.monitoring.coreos.com/kube-scheduler created
servicemonitor.monitoring.coreos.com/kubelet created
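The errors on the first apply happen because the ServiceMonitor CRD is registered by the Prometheus Operator and is not yet established when the rest of the manifests are applied; applying a second time succeeds. Instead of re-applying blindly, the CRD can be waited for explicitly — a hedged sketch, assuming the operator manifests keep the 0prometheus-operator- prefix used in this release:
for f in 0prometheus-operator-*.yaml; do kubectl apply -f "$f"; done
kubectl wait --for condition=established --timeout=60s crd/servicemonitors.monitoring.coreos.com
kubectl apply -f .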
2) Apply kube-prometheus-0.2.0/experimental/custom-metrics-api/ to add the "custom.metrics.k8s.io" API needed for HPA based on custom metrics:
[root@d3-master-001 manifests]# kubectl apply -f ../experimental/custom-metrics-api/
clusterrolebinding.rbac.authorization.k8s.io/custom-metrics-server-resources created
apiservice.apiregistration.k8s.io/v1beta1.custom.metrics.k8s.io created
clusterrole.rbac.authorization.k8s.io/custom-metrics-server-resources created
configmap/adapter-config configured
clusterrolebinding.rbac.authorization.k8s.io/hpa-controller-custom-metrics created
servicemonitor.monitoring.coreos.com/sample-app created
service/sample-app created
deployment.apps/sample-app created
horizontalpodautoscaler.autoscaling/sample-app created
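A quick way to confirm that the custom metrics API is actually being served (assuming the prometheus-adapter Pod is up):
kubectl get apiservices v1beta1.custom.metrics.k8s.io
kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" | head -c 300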
Configure persistent storage
- Without distributed storage, persisting the data is somewhat involved, but the setup below is good enough for data persistence and service availability.
- Because PVC-to-PV binding is not based on names, the PVs and Pods are created one at a time after kube-prometheus is deployed, to avoid PVCs binding to the wrong PVs.
1)Prometheus
a) Prepare the backing storage directories
[root@d3-gpu-066 ~]# mkdir -p /data/kube-prometheus/prometheus
[root@d3-gpu-067 ~]# mkdir -p /data/kube-prometheus/prometheus
b) Create the first PersistentVolume, pv-prometheus-k8s-0
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-prometheus-k8s-0
#labels:
# pv-name: pv-prometheus-k8s-0
spec:
local:
path: /data/kube-prometheus/prometheus
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
# Local volume requires node affinity
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-gpu-066
EOF
persistentvolume/pv-prometheus-k8s-0 created
c) Create the first Pod that uses persistent storage, prometheus-k8s-0
[root@d3-master-001 manifests]# kubectl edit prometheus -n monitoring k8s
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
prometheus: k8s
name: k8s
namespace: monitoring
spec:
...
replicas: 1
retention: 1y
storage:
volumeClaimTemplate:
metadata:
name: prometheus-k8s-db
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
nodeSelector:
kubernetes.io/os: linux
kube-prometheus: "1"
prometheus.monitoring.coreos.com/k8s edited
d) Label the nodes with kube-prometheus=1
[root@d3-master-001 manifests]# kubectl label nodes d3-gpu-066 kube-prometheus=1
node/d3-gpu-066 labeled
[root@d3-master-001 manifests]# kubectl label nodes d3-gpu-067 kube-prometheus=1
node/d3-gpu-067 labeled
[root@d3-master-001 manifests]# kubectl get pods -n monitoring -l app=prometheus -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
prometheus-k8s-0 3/3 Running 1 65s 10.244.16.89 d3-gpu-066 <none> <none>
[root@d3-master-001 manifests]# kubectl get pvc -n monitoring
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
prometheus-k8s-db-prometheus-k8s-0 Bound pv-prometheus-k8s-0 100Gi RWO 91s
[root@d3-master-001 manifests]# kubectl get pv
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
pv-prometheus-k8s-0 100Gi RWO Retain Bound monitoring/prometheus-k8s-db-prometheus-k8s-0 2m38s
e) Create the second PersistentVolume, pv-prometheus-k8s-1, and Pod, prometheus-k8s-1
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-prometheus-k8s-1
spec:
local:
path: /data/kube-prometheus/prometheus
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
# Local volume requires node affinity
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-gpu-067
EOF
persistentvolume/pv-prometheus-k8s-1 created
[root@d3-master-001 manifests]# kubectl patch prometheus -n monitoring k8s --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":2}]'
prometheus.monitoring.coreos.com/k8s patched
[root@d3-master-001 manifests]# kubectl get pods -n monitoring -l app=prometheus -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
prometheus-k8s-0 3/3 Running 1 2m38s 10.244.16.89 d3-gpu-066 <none> <none>
prometheus-k8s-1 3/3 Running 1 17s 10.244.17.96 d3-gpu-067 <none> <none>
[root@d3-master-001 manifests]# kubectl get pvc -n monitoring
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
prometheus-k8s-db-prometheus-k8s-0 Bound pv-prometheus-k8s-0 100Gi RWO 2m44s
prometheus-k8s-db-prometheus-k8s-1 Bound pv-prometheus-k8s-1 100Gi RWO 23s
[root@d3-master-001 manifests]# kubectl get pvc -n monitoring prometheus-k8s-db-prometheus-k8s-0 -o yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
annotations:
pv.kubernetes.io/bind-completed: "yes"
pv.kubernetes.io/bound-by-controller: "yes"
creationTimestamp: "2019-11-12T00:02:39Z"
finalizers:
- kubernetes.io/pvc-protection
labels:
app: prometheus
prometheus: k8s
name: prometheus-k8s-db-prometheus-k8s-0
namespace: monitoring
resourceVersion: "13824630"
selfLink: /api/v1/namespaces/monitoring/persistentvolumeclaims/prometheus-k8s-db-prometheus-k8s-0
uid: bdc4f966-04df-11ea-b539-141877685738
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
volumeMode: Filesystem
volumeName: pv-prometheus-k8s-0
status:
accessModes:
- ReadWriteOnce
capacity:
storage: 100Gi
phase: Bound
2)Alertmanager
a) Prepare the backing storage directories
[root@d3-gpu-066 ~]# mkdir -p /data/kube-prometheus/alertmanager
[root@d3-gpu-067 ~]# mkdir -p /data/kube-prometheus/alertmanager
b) Create the first PersistentVolume, pv-alertmanager-main-0
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-alertmanager-main-0
spec:
local:
path: /data/kube-prometheus/alertmanager
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-gpu-066
EOF
persistentvolume/pv-alertmanager-main-0 created
c) Create the first Pod that uses persistent storage, alertmanager-main-0
[root@d3-master-001 manifests]# kubectl edit alertmanagers -n monitoring main
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
labels:
alertmanager: main
name: main
namespace: monitoring
spec:
...
replicas: 1
retention: 168h
storage:
volumeClaimTemplate:
metadata:
name: alertmanager-main-db
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
nodeSelector:
kubernetes.io/os: linux
kube-prometheus: "1"
alertmanager.monitoring.coreos.com/main edited
[root@d3-master-001 manifests]# kubectl get pods -n monitoring -o wide -l app=alertmanager
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
alertmanager-main-0 2/2 Running 0 7s 10.244.16.90 d3-gpu-066 <none> <none>
d) Create the second PersistentVolume, pv-alertmanager-main-1, and Pod, alertmanager-main-1
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-alertmanager-main-1
spec:
local:
path: /data/kube-prometheus/alertmanager
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-gpu-067
EOF
persistentvolume/pv-alertmanager-main-1 created
[root@d3-master-001 manifests]# kubectl patch alertmanagers -n monitoring main --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":2}]'
alertmanager.monitoring.coreos.com/main patched
[root@d3-master-001 manifests]# kubectl get pods -n monitoring -o wide -l app=alertmanager
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
alertmanager-main-0 2/2 Running 0 8s 10.244.16.91 d3-gpu-066 <none> <none>
alertmanager-main-1 2/2 Running 0 18s 10.244.17.97 d3-gpu-067 <none> <none>
3)Grafana
a) Change the folder that the default dashboards are placed in on the Grafana UI
[root@d3-master-001 manifests]# kubectl edit configmaps -n monitoring grafana-dashboards
apiVersion: v1
data:
dashboards.yaml: |-
{
"apiVersion": 1,
"providers": [
{
"folder": "Kubernetes",
"name": "prometheus",
"options": {
"path": "/grafana-dashboard-definitions/0"
},
"orgId": 1,
"type": "file"
}
]
}
kind: ConfigMap
metadata:
name: grafana-dashboards
namespace: monitoring
b) Create the hostPath directory used by Grafana and fix its ownership
[root@d3-gpu-066 ~]# mkdir -p /data/kube-prometheus/grafana/ && chown 472:472 /data/kube-prometheus/grafana/
[root@d3-gpu-067 ~]# mkdir -p /data/kube-prometheus/grafana/ && chown 472:472 /data/kube-prometheus/grafana/
c) Deploy Grafana as a single replica using a hostPath volume
[root@d3-gpu-059 ~]# mkdir -p /data/kube-prometheus/grafana/ && chown 472:472 /data/kube-prometheus/grafana/
[root@d3-master-001 manifests]# kubectl edit deployments -n monitoring grafana
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
app: grafana
name: grafana
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
...
nodeSelector:
beta.kubernetes.io/os: linux
kube-prometheus: "1"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-gpu-066
volumes:
- name: grafana-storage
hostPath:
path: /data/kube-prometheus/grafana
...
securityContext:
runAsNonRoot: true
fsGroup: 472
runAsUser: 472
deployment.extensions/grafana edited
- The nodeAffinity makes Grafana prefer d3-gpu-066, which gives reasonable data persistence in the absence of distributed storage; if d3-gpu-066 goes down, Grafana is rescheduled onto another node labeled kube-prometheus=1 (here d3-gpu-067), which keeps Grafana reasonably available.
- The Grafana container runs as UID 472 by default, so the securityContext must be set and the hostPath directory ownership changed accordingly, otherwise the container fails to start (a quick check is shown below).
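A quick check on the node and of the scheduling result (a hedged example; paths and labels are the ones used above):
# on d3-gpu-066 / d3-gpu-067: owner and group of the data directory must be 472
ls -ldn /data/kube-prometheus/grafana/
# from the control node: the Grafana Pod should be Running on a kube-prometheus=1 node
kubectl get pods -n monitoring -l app=grafana -o wide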
Domain access configuration
Prometheus, Alertmanager and Grafana share the single domain k8s-gk.duiopen.com, separated by URL subpaths.
1) Prometheus domain access
a) Prometheus/k8s configuration
[root@d3-master-001 manifests]# kubectl edit prometheus -n monitoring k8s
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
prometheus: k8s
name: k8s
namespace: monitoring
spec:
...
externalUrl: http://k8s-gk.duiopen.com/prometheus
prometheus.monitoring.coreos.com/k8s edited
[root@d3-master-001 kubernetes]# kubectl get sts -n monitoring prometheus-k8s -o yaml|grep -A17 "template:"
template:
metadata:
creationTimestamp: null
labels:
app: prometheus
prometheus: k8s
spec:
containers:
- args:
- --web.console.templates=/etc/prometheus/consoles
- --web.console.libraries=/etc/prometheus/console_libraries
- --config.file=/etc/prometheus/config_out/prometheus.env.yaml
- --storage.tsdb.path=/prometheus
- --storage.tsdb.retention.time=1y
- --web.enable-lifecycle
- --storage.tsdb.no-lockfile
- --web.external-url=http://k8s-gk.duiopen.com/prometheus
- --web.route-prefix=/
b) Ingress/prometheus-ingress configuration
[root@d3-master-001 manifests]# kubectl apply -f - <<"EOF"
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: 'false'
nginx.ingress.kubernetes.io/whitelist-source-range: "58.210.212.110"
nginx.ingress.kubernetes.io/configuration-snippet: |
rewrite ^/prometheus(.*)$ $1 break;
generation: 3
name: prometheus-ingress
namespace: monitoring
spec:
rules:
- host: k8s-gk.duiopen.com
http:
paths:
- backend:
serviceName: prometheus-k8s
servicePort: 9090
path: /prometheus
EOF
ingress.extensions/prometheus-ingress created
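A hedged smoke test of the subpath routing (assuming k8s-gk.duiopen.com resolves to the ingress entry point and the request originates from the whitelisted source IP 58.210.212.110; other sources get 403):
curl -I http://k8s-gk.duiopen.com/prometheus/graph
# expect: HTTP/1.1 200 OK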
2) Alertmanager domain access
a) Alertmanager/main configuration
[root@d3-master-001 manifests]# kubectl edit alertmanagers -n monitoring main
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: main
namespace: monitoring
spec:
...
externalUrl: http://k8s-gk.duiopen.com/alertmanager/
b) Ingress/alertmanager-ingress configuration
[root@d3-master-001 manifests]# kubectl apply -f - <<"EOF"
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: 'false'
nginx.ingress.kubernetes.io/whitelist-source-range: "58.210.212.110"
nginx.ingress.kubernetes.io/configuration-snippet: |
rewrite ^/alertmanager/(.*)$ /$1 break;
name: alertmanager-ingress
namespace: monitoring
spec:
rules:
- host: k8s-gk.duiopen.com
http:
paths:
- backend:
serviceName: alertmanager-main
servicePort: 9093
path: /alertmanager
EOF
ingress.extensions/alertmanager-ingress created
3) Grafana domain access
a) Create a ConfigMap holding the Grafana configuration file grafana.ini
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-config
namespace: monitoring
data:
grafana.ini: |
[server]
# Protocol (http, https, socket)
protocol = http
# The ip address to bind to, empty will bind to all interfaces
;http_addr =
# The http port to use
http_port = 3000
# The public facing domain name used to access grafana from a browser
domain = k8s-gk.duiopen.com
#This is the full URL used to access Grafana from a web browser. This is important if you use Google or GitHub OAuth authentication (for the callback URL to be correct).
# This setting is also important if you have a reverse proxy in front of Grafana that exposes it through a subpath. In that case add the subpath to the end of this URL setting.
root_url = http://k8s-gk.duiopen.com/grafana
#root_url = %(protocol)s://%(domain)s:%(http_port)s/
# Serve Grafana from subpath specified in root_url setting. By default it is set to false for compatibility reasons.
# By enabling this setting and using a subpath in root_url above, e.g. root_url = http://localhost:3000/grafana, Grafana will be accessible on http://localhost:3000/grafana.
serve_from_sub_path = true
# Set this option to true to enable HTTP compression, this can improve transfer speed and bandwidth utilization. It is recommended that most users set it to true. By default it is set to false for compatibility reasons.
enable_gzip = true
EOF
configmap/grafana-config created
b) Edit Deployment/grafana to mount ConfigMap/grafana-config so that grafana.ini appears at /etc/grafana/grafana.ini
[root@d3-master-001 manifests]# kubectl edit deployments -n monitoring grafana
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: grafana
namespace: monitoring
spec:
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
volumeMounts:
- mountPath: /etc/grafana
name: grafana-config
...
volumes:
- configMap:
name: grafana-config
name: grafana-config
...
deployment.extensions/grafana edited
c) Ingress/grafana-ingress configuration
[root@d3-master-001 manifests]# kubectl apply -f - <<"EOF"
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: 'false'
nginx.ingress.kubernetes.io/whitelist-source-range: "58.210.212.110"
nginx.ingress.kubernetes.io/configuration-snippet: |
rewrite ^/grafana/(.*)$ /$1 break;
name: grafana-ingress
namespace: monitoring
spec:
rules:
- host: k8s-gk.duiopen.com
http:
paths:
- backend:
serviceName: grafana
servicePort: 3000
path: /grafana
EOF
ingress.extensions/grafana-ingress created
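As with Prometheus, a hedged smoke test that serve_from_sub_path and the ingress rewrite work together (same source-IP whitelist caveat as above):
curl -I http://k8s-gk.duiopen.com/grafana/login
# expect: HTTP/1.1 200 OK rather than a redirect loop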
December 9, 2019: moving Prometheus to different nodes
1. Services and nodes involved
1) Services involved: 2 StatefulSets, 4 Deployments, 1 DaemonSet
[root@d3-master-001 ~]# kubectl get sts -n monitoring
NAME READY AGE
alertmanager-main 2/2 38d
prometheus-k8s 2/2 38d
[root@d3-master-001 ~]# kubectl get deployments -n monitoring
NAME READY UP-TO-DATE AVAILABLE AGE
grafana 1/1 1 1 38d
kube-state-metrics 1/1 1 1 38d
prometheus-adapter 1/1 1 1 38d
prometheus-operator 1/1 1 1 38d
[root@d3-master-001 ~]# kubectl get ds -n monitoring
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
node-exporter 21 21 21 21 21 kubernetes.io/os=linux 38d
2) Nodes involved: d3-nginx-001 through d3-nginx-003
[root@d3-master-001 ~]# kubectl label nodes d3-nginx-001 kube-prometheus=1
node/d3-nginx-001 labeled
[root@d3-master-001 ~]# kubectl label nodes d3-nginx-002 kube-prometheus=1
node/d3-nginx-002 labeled
[root@d3-master-001 ~]# kubectl label nodes d3-nginx-003 kube-prometheus=1
node/d3-nginx-003 labeled
[root@d3-master-001 ~]# kubectl label nodes d3-gpu-066 kube-prometheus-
node/d3-gpu-066 labeled
[root@d3-master-001 ~]# kubectl label nodes d3-gpu-067 kube-prometheus-
node/d3-gpu-067 labeled
2. Reschedule prometheus and alertmanager onto d3-nginx-001 and d3-nginx-002
1)prometheus
[root@d3-master-001 ~]# kubectl patch prometheus -n monitoring k8s --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":0}]'
prometheus.monitoring.coreos.com/k8s patched
[root@d3-master-001 ~]# kubectl delete pvc -n monitoring -l app=prometheus --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
persistentvolumeclaim "prometheus-k8s-db-prometheus-k8s-0" force deleted
persistentvolumeclaim "prometheus-k8s-db-prometheus-k8s-1" force deleted
[root@d3-master-001 ~]# kubectl delete pv pv-prometheus-k8s-0 --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
persistentvolume "pv-prometheus-k8s-0" force deleted
[root@d3-master-001 ~]# kubectl delete pv pv-prometheus-k8s-1 --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
persistentvolume "pv-prometheus-k8s-1" force deleted
[root@d3-nginx-001 ~]# mkdir -p /data/kube-prometheus/{prometheus,alertmanager}
[root@d3-nginx-002 ~]# mkdir -p /data/kube-prometheus/{prometheus,alertmanager}
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-prometheus-k8s-0
spec:
local:
path: /data/kube-prometheus/prometheus
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
# Local volume requires node affinity
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-001
EOF
persistentvolume/pv-prometheus-k8s-0 created
[root@d3-master-001 ~]# kubectl patch prometheus -n monitoring k8s --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":1}]'
prometheus.monitoring.coreos.com/k8s patched
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-prometheus-k8s-1
spec:
local:
path: /data/kube-prometheus/prometheus
capacity:
storage: 100Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
# Local volume requires node affinity
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-002
EOF
persistentvolume/pv-prometheus-k8s-1 created
[root@d3-master-001 ~]# kubectl patch prometheus -n monitoring k8s --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":2}]'
prometheus.monitoring.coreos.com/k8s patched
[root@d3-master-001 ~]# kubectl get pvc -n monitoring -l app=prometheus
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
prometheus-k8s-db-prometheus-k8s-0 Bound pv-prometheus-k8s-0 100Gi RWO 57s
prometheus-k8s-db-prometheus-k8s-1 Bound pv-prometheus-k8s-1 100Gi RWO 20s
[root@d3-master-001 ~]# kubectl get pods -n monitoring -l app=prometheus -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
prometheus-k8s-0 3/3 Running 1 3m25s 10.244.43.3 d3-nginx-001 <none> <none>
prometheus-k8s-1 3/3 Running 1 2m48s 10.244.41.3 d3-nginx-002 <none> <none>
2)alertmanager
[root@d3-master-001 ~]# kubectl patch alertmanager -n monitoring main --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":0}]'
alertmanager.monitoring.coreos.com/main patched
[root@d3-master-001 ~]# kubectl delete pvc -n monitoring -l app=alertmanager --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
persistentvolumeclaim "alertmanager-main-db-alertmanager-main-0" force deleted
persistentvolumeclaim "alertmanager-main-db-alertmanager-main-1" force deleted
[root@d3-master-001 ~]# kubectl delete pv pv-alertmanager-main-0 pv-alertmanager-main-1 --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
persistentvolume "pv-alertmanager-main-0" force deleted
persistentvolume "pv-alertmanager-main-1" force deleted
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-alertmanager-main-0
spec:
local:
path: /data/kube-prometheus/alertmanager
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-001
EOF
persistentvolume/pv-alertmanager-main-0 created
[root@d3-master-001 ~]# kubectl patch alertmanager -n monitoring main --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":1}]'
alertmanager.monitoring.coreos.com/main patched
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-alertmanager-main-1
spec:
local:
path: /data/kube-prometheus/alertmanager
capacity:
storage: 20Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Retain
nodeAffinity:
required:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-002
EOF
persistentvolume/pv-alertmanager-main-1 created
[root@d3-master-001 ~]# kubectl patch alertmanager -n monitoring main --type='json' --patch='[{"op": "replace", "path":"/spec/replicas","value":2}]'
alertmanager.monitoring.coreos.com/main patched
[root@d3-master-001 ~]# kubectl get pvc -n monitoring -l app=alertmanager
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
alertmanager-main-db-alertmanager-main-0 Bound pv-alertmanager-main-0 20Gi RWO 50s
alertmanager-main-db-alertmanager-main-1 Bound pv-alertmanager-main-1 20Gi RWO 22s
[root@d3-master-001 ~]# kubectl get pods -n monitoring -l app=alertmanager -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
alertmanager-main-0 2/2 Running 0 23s 10.244.43.5 d3-nginx-001 <none> <none>
alertmanager-main-1 2/2 Running 0 38s 10.244.41.4 d3-nginx-002 <none> <none>
3. Prefer scheduling grafana, kube-state-metrics, prometheus-adapter and prometheus-operator onto d3-nginx-003; fall back to d3-nginx-001 or d3-nginx-002 when d3-nginx-003 is unschedulable.
1)grafana
[root@d3-nginx-001 ~]# mkdir -p /data/kube-prometheus/grafana && chown 472:472 /data/kube-prometheus/grafana/
[root@d3-nginx-002 ~]# mkdir -p /data/kube-prometheus/grafana && chown 472:472 /data/kube-prometheus/grafana/
[root@d3-nginx-003 ~]# mkdir -p /data/kube-prometheus/grafana && chown 472:472 /data/kube-prometheus/grafana/
[root@d3-master-001 manifests]# kubectl edit deployments -n monitoring grafana
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
app: grafana
name: grafana
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
...
nodeSelector:
beta.kubernetes.io/os: linux
kube-prometheus: "1"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-003
volumes:
- name: grafana-storage
hostPath:
path: /data/kube-prometheus/grafana
...
securityContext:
runAsNonRoot: true
fsGroup: 472
runAsUser: 472
deployment.extensions/grafana edited
[root@d3-master-001 ~]# kubectl delete pods -n monitoring -l app=grafana --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
pod "grafana-8fdccb846-br7s8" force deleted
[root@d3-master-001 ~]# kubectl get pods -n monitoring -l app=grafana -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
grafana-8fdccb846-99hmq 1/1 Running 0 14s 10.244.39.4 d3-nginx-003 <none> <none>
2)kube-state-metrics
[root@d3-master-001 manifests]# kubectl edit deployments -n monitoring kube-state-metrics
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
app: kube-state-metrics
name: kube-state-metrics
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
spec:
nodeSelector:
beta.kubernetes.io/os: linux
kube-prometheus: "1"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-003
...
deployment.extensions/kube-state-metrics edited
[root@d3-master-001 ~]# kubectl delete pods -n monitoring -l app=kube-state-metrics --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
pod "kube-state-metrics-75c4c48669-nzx4n" force deleted
[root@d3-master-001 ~]# kubectl get pods -n monitoring -l app=kube-state-metrics -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-state-metrics-75c4c48669-fb8gg 4/4 Running 0 17s 10.244.39.6 d3-nginx-003 <none> <none>
3)prometheus-adapter
[root@d3-master-001 manifests]# kubectl edit deployments -n monitoring prometheus-adapter
apiVersion: apps/v1beta2
kind: Deployment
metadata:
name: prometheus-adapter
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
name: prometheus-adapter
template:
metadata:
labels:
name: prometheus-adapter
spec:
nodeSelector:
beta.kubernetes.io/os: linux
kube-prometheus: "1"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-003
...
deployment.extensions/prometheus-adapter edited
[root@d3-master-001 ~]# kubectl delete pods -n monitoring -l name=prometheus-adapter --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
pod "prometheus-adapter-5b64874445-s4s6k" force deleted
[root@d3-master-001 ~]# kubectl get pods -n monitoring -l name=prometheus-adapter -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
prometheus-adapter-5b64874445-79fz6 1/1 Running 0 12s 10.244.39.8 d3-nginx-003 <none> <none>
4)prometheus-operator
[root@d3-master-001 manifests]# kubectl edit deployments -n monitoring prometheus-operator
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.33.0
name: prometheus-operator
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
template:
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.33.0
spec:
nodeSelector:
beta.kubernetes.io/os: linux
kube-prometheus: "1"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- d3-nginx-003
...
deployment.extensions/prometheus-operator edited
[root@d3-master-001 ~]# kubectl delete pods -n monitoring -l app.kubernetes.io/name=prometheus-operator --force --grace-period=0
warning: Immediate deletion does not wait for confirmation that the running resource has been terminated. The resource may continue to run on the cluster indefinitely.
pod "prometheus-operator-ccc557658-z67f6" force deleted
[root@d3-master-001 ~]# kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus-operator -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
prometheus-operator-ccc557658-bcn5j 1/1 Running 0 10s 10.244.39.10 d3-nginx-003 <none> <none>
Final layout of the services:
[root@d3-master-001 ~]# kubectl get pods -n monitoring -o wide|grep -v ^node-exporter
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
alertmanager-main-0 2/2 Running 0 6m2s 10.244.43.5 d3-nginx-001 <none> <none>
alertmanager-main-1 2/2 Running 0 6m17s 10.244.41.4 d3-nginx-002 <none> <none>
grafana-8fdccb846-99hmq 1/1 Running 0 89m 10.244.39.4 d3-nginx-003 <none> <none>
kube-state-metrics-75c4c48669-fb8gg 4/4 Running 0 83m 10.244.39.6 d3-nginx-003 <none> <none>
prometheus-adapter-5b64874445-79fz6 1/1 Running 0 81m 10.244.39.8 d3-nginx-003 <none> <none>
prometheus-k8s-0 3/3 Running 1 13m 10.244.43.3 d3-nginx-001 <none> <none>
prometheus-k8s-1 3/3 Running 1 12m 10.244.41.3 d3-nginx-002 <none> <none>
prometheus-operator-ccc557658-bcn5j 1/1 Running 0 78m 10.244.39.10 d3-nginx-003 <none> <none>
Prometheus scrape target configuration
kube-scheduler and kube-controller-manager monitoring
Creating Services for kube-scheduler and kube-controller-manager is all that is needed for Prometheus to monitor them (the corresponding ServiceMonitors already ship with kube-prometheus):
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: Service
metadata:
labels:
k8s-app: kube-scheduler
name: kube-scheduler
namespace: kube-system
spec:
ports:
- name: http-metrics
port: 10251
protocol: TCP
targetPort: 10251
selector:
component: kube-scheduler
type: ClusterIP
EOF
service/kube-scheduler created
[root@d3-master-001 manifests]# kubectl apply -f - <<EOF
apiVersion: v1
kind: Service
metadata:
labels:
k8s-app: kube-controller-manager
name: kube-controller-manager
namespace: kube-system
spec:
ports:
- name: http-metrics
port: 10252
protocol: TCP
targetPort: 10252
selector:
component: kube-controller-manager
type: ClusterIP
EOF
service/kube-controller-manager created
- Once the Services are created, the Targets entry "monitoring/kube-scheduler/0 (0/0 up)" becomes "monitoring/kube-scheduler/0 (3/3 up)", and likewise for kube-controller-manager; a quick endpoint check is shown below.
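A quick way to verify that the new Services actually selected the static Pods on the masters (hedged example; the addresses shown depend on the cluster):
kubectl get endpoints -n kube-system kube-scheduler kube-controller-manager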
Monitoring the etcd cluster used by Kubernetes (kube-etcd)
1) Configure the etcd certificates
a) Create the kube-etcd-healthcheck-client certificate
# Create the certificate signing request
[root@d3-master-001 kubernetes]# cat > cert/kube-etcd-healthcheck-client-csr.json <<"EOF"
{
"CN": "kube-etcd-healthcheck-client",
"hosts": [],
"key": {
"algo": "rsa",
"size": 2048
},
"names": [
{
"C": "CN",
"ST": "Jiangsu",
"L": "Suzhou",
"O": "k8s",
"OU": "aispeech"
}
]
}
EOF
# Generate the certificate and private key
[root@d3-master-001 kubernetes]# cfssl gencert \
-ca=cert/etcd-ca.pem \
-ca-key=cert/etcd-ca-key.pem \
-config=cert/ca-config.json \
-profile=kubernetes \
cert/kube-etcd-healthcheck-client-csr.json| cfssljson -bare cert/kube-etcd-healthcheck-client
# Inspect the certificate
[root@d3-master-001 kubernetes]# openssl x509 -noout -text -in cert/kube-etcd-healthcheck-client.pem |head -n 11
Certificate:
Data:
Version: 3 (0x2)
Serial Number:
7b:e8:31:6a:8c:51:0a:2d:9b:d4:36:c2:31:de:6c:55:c3:94:24:13
Signature Algorithm: sha256WithRSAEncryption
Issuer: C=CN, ST=Jiangsu, L=Suzhou, O=k8s, OU=aispeech, CN=etcd-ca
Validity
Not Before: Oct 20 02:32:00 2019 GMT
Not After : Oct 17 02:32:00 2029 GMT
Subject: C=CN, ST=Jiangsu, L=Suzhou, O=k8s, OU=aispeech, CN=kube-etcd-healthcheck-client
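Optionally, before wrapping the files into a Secret, the new client certificate can be sanity-checked directly against one etcd member (a hedged example, assuming etcd serves client traffic on port 2379 on the masters, as reflected in the Endpoints created later):
curl -s --cacert cert/etcd-ca.pem \
  --cert cert/kube-etcd-healthcheck-client.pem \
  --key cert/kube-etcd-healthcheck-client-key.pem \
  https://10.24.10.74:2379/metrics | head -n 3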
b) Create Secret/kube-etcd
[root@d3-master-001 kubernetes]# kubectl create secret -n monitoring generic kube-etcd \
--from-file=cert/etcd-ca.pem \
--from-file=cert/kube-etcd-healthcheck-client.pem \
--from-file=cert/kube-etcd-healthcheck-client-key.pem
secret/kube-etcd created
c) Edit Prometheus/k8s to add Secret/kube-etcd
[root@d3-master-001 kubernetes]# kubectl edit prometheus -n monitoring k8s
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
prometheus: k8s
name: k8s
namespace: monitoring
spec:
...
secrets:
- kube-etcd
[root@d3-master-001 kubernetes]# kubectl get pods -n monitoring -l app=prometheus
NAME READY STATUS RESTARTS AGE
prometheus-k8s-0 0/3 Terminating 0 109m
prometheus-k8s-1 3/3 Running 0 23s
[root@d3-master-001 kubernetes]# kubectl exec -n monitoring prometheus-k8s-1 -c prometheus -- ls -l /etc/prometheus/secrets/kube-etcd
total 0
lrwxrwxrwx 1 root root 18 Oct 20 02:39 etcd-ca.pem -> ..data/etcd-ca.pem
lrwxrwxrwx 1 root root 43 Oct 20 02:39 kube-etcd-healthcheck-client-key.pem -> ..data/kube-etcd-healthcheck-client-key.pem
lrwxrwxrwx 1 root root 39 Oct 20 02:39 kube-etcd-healthcheck-client.pem -> ..data/kube-etcd-healthcheck-client.pem
[root@d3-master-001 kubernetes]# kubectl exec -n monitoring prometheus-k8s-1 -c prometheus -- find /etc/prometheus/secrets/kube-etcd/
/etc/prometheus/secrets/kube-etcd/
/etc/prometheus/secrets/kube-etcd/..data
/etc/prometheus/secrets/kube-etcd/kube-etcd-healthcheck-client.pem
/etc/prometheus/secrets/kube-etcd/kube-etcd-healthcheck-client-key.pem
/etc/prometheus/secrets/kube-etcd/etcd-ca.pem
/etc/prometheus/secrets/kube-etcd/..2019_10_20_02_39_40.688830209
/etc/prometheus/secrets/kube-etcd/..2019_10_20_02_39_40.688830209/kube-etcd-healthcheck-client.pem
/etc/prometheus/secrets/kube-etcd/..2019_10_20_02_39_40.688830209/kube-etcd-healthcheck-client-key.pem
/etc/prometheus/secrets/kube-etcd/..2019_10_20_02_39_40.688830209/etcd-ca.pem
2) Create the ServiceMonitor (reference: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md)
[root@d3-master-001 kubernetes]# kubectl apply -f - <<EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kube-etcd
name: kube-etcd
namespace: monitoring
spec:
jobLabel: k8s-app
endpoints:
- port: https
interval: 15s
scheme: https
tlsConfig:
caFile: /etc/prometheus/secrets/kube-etcd/etcd-ca.pem
certFile: /etc/prometheus/secrets/kube-etcd/kube-etcd-healthcheck-client.pem
keyFile: /etc/prometheus/secrets/kube-etcd/kube-etcd-healthcheck-client-key.pem
insecureSkipVerify: true
namespaceSelector:
matchNames:
- monitoring
selector:
matchLabels:
k8s-app: kube-etcd
EOF
servicemonitor.monitoring.coreos.com/kube-etcd created
- Selected fields of ServiceMonitor.spec.endpoints (all optional):
  - port: name of the Service port this endpoint refers to; mutually exclusive with targetPort (string)
  - targetPort: name or number of the target port of the endpoint; mutually exclusive with port (IntOrString)
  - path: HTTP path to scrape for metrics (string)
  - scheme: HTTP scheme to use for scraping (string)
  - params: optional HTTP URL parameters (map[string][]string)
  - interval: interval at which metrics should be scraped (string)
  - scrapeTimeout: timeout after which the scrape is ended (string)
  - tlsConfig: TLS configuration to use when scraping the endpoint (*TLSConfig)
- ServiceMonitor.spec.endpoints.port must match a Service.spec.ports.name; otherwise the Prometheus target stays at "monitoring/kube-etcd/0 (0/0 up)" forever, exactly as if the Service had never been created.
3) Create Service/kube-etcd
kube-etcd is not deployed inside Kubernetes, so the Service is created with hand-written Endpoints:
[root@d3-master-001 kubernetes]# kubectl apply -f - <<EOF
apiVersion: v1
kind: Service
metadata:
name: kube-etcd
namespace: monitoring
labels:
k8s-app: kube-etcd
spec:
ports:
- name: https
port: 2379
type: ClusterIP
clusterIP: None
---
apiVersion: v1
kind: Endpoints
metadata:
name: kube-etcd
namespace: monitoring
labels:
k8s-app: kube-etcd
subsets:
- addresses:
- ip: 10.24.10.74
nodeName: d3-master-001
- ip: 10.24.10.75
nodeName: d3-master-002
- ip: 10.24.10.76
nodeName: d3-master-003
ports:
- name: https
port: 2379
protocol: TCP
EOF
service/kube-etcd created
endpoints/kube-etcd created
4) Check the scrape status in the Prometheus UI
- Prometheus UI -> Status -> Configuration: check that scrape_configs contains "job_name: monitoring/kube-etcd/0":
- job_name: monitoring/kube-etcd/0
honor_timestamps: true
scrape_interval: 15s
scrape_timeout: 10s
metrics_path: /metrics
scheme: https
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- monitoring
tls_config:
ca_file: /etc/prometheus/secrets/kube-etcd/etcd-ca.pem
cert_file: /etc/prometheus/secrets/kube-etcd/kube-etcd-healthcheck-client.pem
key_file: /etc/prometheus/secrets/kube-etcd/kube-etcd-healthcheck-client-key.pem
insecure_skip_verify: true
relabel_configs:
...
- Prometheus UI -> Status -> Targets: check that "monitoring/kube-etcd/0 (3/3 up)" exists and that its state really is "(3/3 up)"; the same can be checked from the command line as shown below.
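The command-line equivalent goes through the Prometheus HTTP API (hedged; assumes the Ingress from the previous section and a whitelisted source IP):
curl -s http://k8s-gk.duiopen.com/prometheus/api/v1/targets | grep -c kube-etcd
# a non-zero count means the kube-etcd targets are registered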
PS: An even simpler way to add a scrape target would be to add a "static_configs" section directly, for example the snippet below — but where would it go?
- job_name: 'etcd'
metrics_path: /metrics
scheme: https
tls_config:
cert_file: 'ssl/server.pem'
key_file: 'ssl/server-key.pem'
insecure_skip_verify: true
static_configs:
- targets:
- '172.25.50.16:2379'
- '172.25.50.17:2379'
- '172.25.50.18:2379'
labels:
group: 'etcd'
Monitoring the etcd cluster used by the business side (daoker-etcd)
- daoker-etcd does not use HTTPS, so no certificates need to be configured; a ServiceMonitor and a Service are enough.
1) Create ServiceMonitor/daoker-etcd
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: daoker-etcd
name: daoker-etcd
namespace: monitoring
spec:
jobLabel: k8s-app
endpoints:
- port: http
interval: 15s
scheme: http
namespaceSelector:
matchNames:
- monitoring
selector:
matchLabels:
k8s-app: daoker-etcd
EOF
servicemonitor.monitoring.coreos.com/daoker-etcd created
2) Create Service/daoker-etcd
[root@d3-master-001 ~]# kubectl apply -f - <<EOF
apiVersion: v1
kind: Service
metadata:
name: daoker-etcd
namespace: monitoring
labels:
k8s-app: daoker-etcd
spec:
ports:
- name: http
port: 2381
type: ClusterIP
clusterIP: None
---
apiVersion: v1
kind: Endpoints
metadata:
name: daoker-etcd
namespace: monitoring
labels:
k8s-app: daoker-etcd
subsets:
- addresses:
- ip: 10.24.10.74
nodeName: d3-master-001
- ip: 10.24.10.75
nodeName: d3-master-002
- ip: 10.24.10.76
nodeName: d3-master-003
ports:
- name: http
port: 2381
protocol: TCP
EOF
service/daoker-etcd created
endpoints/daoker-etcd created
Adding HAProxy monitoring
HAProxy 2.0.0 and later ship an official Prometheus exporter module, see https://github.com/prometheus/haproxy_exporter :
- As of 2.0.0, HAProxy includes a Prometheus exporter module that can be built into your binary during build time.
- To build with the official Prometheus exporter module, make with the following EXTRA_OBJS flag:
make TARGET=linux-glibc EXTRA_OBJS="contrib/prometheus-exporter/service-prometheus.o"
- Once built, you can enable and configure the Prometheus endpoint from your haproxy.cfg file as a typical frontend:
frontend stats
bind *:8404
http-request use-service prometheus-exporter if { path /metrics }
stats enable
stats uri /stats
stats refresh 10s
a) Check the HAProxy metrics endpoints
//ingress-haproxy
[root@d3-master-001 kubernetes]# curl 10.24.10.113:9000/metrics -I
HTTP/1.1 200 OK
cache-control: no-cache
content-type: text/plain; version=0.0.4
transfer-encoding: chunked
connection: close
//apiserver-haproxy
[root@d3-master-001 kubernetes]# curl 10.24.10.114:8441/metrics -I
HTTP/1.1 200 OK
cache-control: no-cache
content-type: text/plain; version=0.0.4
transfer-encoding: chunked
connection: close
b) Add the scrape configuration (this turns out not to work under kube-prometheus): modifying prometheus.yaml inside secret/prometheus-k8s
[root@d3-master-001 kubernetes]# kubectl get secret -n monitoring prometheus-k8s -o yaml|grep prometheus.yaml.gz|awk '{print $2}'|base64 -d|gzip -d
global:
evaluation_interval: 30s
scrape_interval: 30s
external_labels:
prometheus: monitoring/k8s
prometheus_replica: $(POD_NAME)
rule_files:
- /etc/prometheus/rules/prometheus-k8s-rulefiles-0/*.yaml
scrape_configs:
...
alerting:
...
[root@d3-master-001 kubernetes]# kubectl get secret -n monitoring prometheus-k8s -o yaml|grep prometheus.yaml.gz|awk '{print $2}' > prometheus.yaml.gz.bak
[root@d3-master-001 kubernetes]# kubectl get secret -n monitoring prometheus-k8s -o yaml|\
grep prometheus.yaml.gz|awk '{print $2}'|\
base64 -d|gzip -d|\
sed "/scrape_configs:/a\
- job_name: monitoring\/ingress-haproxy\/0 \n\
static_configs: \n\
- targets: ['10.24.10.113:9000'] \n\
labels: \n\
alias: ingress-haproxy \n\
- job_name: monitoring\/apiserver-haproxy\/0 \n\
static_configs: \n\
- targets: ['10.24.10.114:8441'] \n\
labels: \n\
alias: apiserver-haproxy"|\
gzip > prometheus.yaml.gz
[root@d3-master-001 kubernetes]# kubectl create secret generic -n monitoring prometheus-k8s --from-file=prometheus.yaml.gz --dry-run -o yaml|kubectl apply -f -
Warning: kubectl apply should be used on resource created by either kubectl create --save-config or kubectl apply
secret/prometheus-k8s configured
[root@d3-master-001 kubernetes]# kubectl get secret -n monitoring prometheus-k8s -o yaml|grep prometheus.yaml.gz|awk '{print $2}'|base64 -d|gzip -d|grep haproxy
[root@d3-master-001 kubernetes]# kubectl create secret generic prometheus-k8s -n monitoring --from-file=prometheus.yaml.gz --dry-run -o yaml|kubectl replace -f -
secret/prometheus-k8s replaced
[root@d3-master-001 kubernetes]# kubectl get secret -n monitoring prometheus-k8s -o yaml|grep prometheus.yaml.gz|awk '{print $2}'|base64 -d|gzip -d|grep haproxy
[root@d3-master-001 kubernetes]# kubectl delete secrets -n monitoring prometheus-k8s
secret "prometheus-k8s" deleted
[root@d3-master-001 kubernetes]# kubectl get secret -n monitoring prometheus-k8s
NAME TYPE DATA AGE
prometheus-k8s Opaque 1 16s
- secret/prometheus-k8s simply cannot be modified this way — the operator keeps regenerating it. Does that mean a ServiceMonitor plus a Service is the only option? (An operator-native alternative is sketched below; the ServiceMonitor approach is what is actually used here.)
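For completeness: the Prometheus Operator documents an additionalScrapeConfigs field on the Prometheus CR that points at a key of a Secret holding raw scrape_configs entries. This is only a hedged sketch (verify the field is supported by the operator version you run) and was not used here; the Service + ServiceMonitor approach below is what was actually deployed.
# prometheus-additional.yaml: plain Prometheus scrape_configs entries
- job_name: ingress-haproxy
  static_configs:
  - targets: ['10.24.10.113:9000']
- job_name: apiserver-haproxy
  static_configs:
  - targets: ['10.24.10.114:8441']
# wrap it in a Secret and reference it from the Prometheus CR:
# kubectl create secret generic additional-scrape-configs -n monitoring --from-file=prometheus-additional.yaml
# then under Prometheus/k8s spec:
#   additionalScrapeConfigs:
#     name: additional-scrape-configs
#     key: prometheus-additional.yaml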
Create the Services:
[root@d3-master-001 kubernetes]# kubectl apply -f - <<EOF
apiVersion: v1
kind: Service
metadata:
labels:
haproxy: apiserver-haproxy
name: apiserver-haproxy
namespace: monitoring
spec:
clusterIP: None
ports:
- name: http-metrics
port: 8441
protocol: TCP
type: ClusterIP
---
apiVersion: v1
kind: Endpoints
metadata:
labels:
haproxy: apiserver-haproxy
name: apiserver-haproxy
namespace: monitoring
subsets:
- addresses:
- ip: 10.24.10.114
nodeName: apiserver-vip
ports:
- name: http-metrics
port: 8441
protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
labels:
haproxy: ingress-haproxy
name: ingress-haproxy
namespace: monitoring
spec:
clusterIP: None
ports:
- name: http-metrics
port: 9000
protocol: TCP
type: ClusterIP
---
apiVersion: v1
kind: Endpoints
metadata:
labels:
haproxy: ingress-haproxy
name: ingress-haproxy
namespace: monitoring
subsets:
- addresses:
- ip: 10.24.10.113
nodeName: ingress-vip
ports:
- name: http-metrics
port: 9000
protocol: TCP
EOF
service/apiserver-haproxy configured
endpoints/apiserver-haproxy created
service/ingress-haproxy configured
endpoints/ingress-haproxy created
Create the ServiceMonitor:
[root@d3-master-001 kubernetes]# kubectl apply -f - <<EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: haproxy
namespace: monitoring
labels:
k8s-apps: haproxy
spec:
jobLabel: k8s-apps
selector:
matchExpressions:
- {key: haproxy, operator: Exists}
namespaceSelector:
matchNames:
- monitoring
endpoints:
- port: http-metrics
interval: 15s
EOF
servicemonitor.monitoring.coreos.com/haproxy created
- Reference: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/running-exporters.md
- A single ServiceMonitor selects both Services here, which matches the prometheus-operator recommendation: "The goal for one ServiceMonitor should be to cover a large number of Services. This can be achieved by creating a generic ServiceMonitor."
- Sample metrics from the two HAProxy instances (see the example query below):
haproxy_process_requests_total{endpoint="http-metrics",instance="10.24.10.114:8441",job="apiserver-haproxy",namespace="monitoring",service="apiserver-haproxy"}
haproxy_process_requests_total{endpoint="http-metrics",instance="10.24.10.113:9000",job="ingress-haproxy",namespace="monitoring",service="ingress-haproxy"}
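For example, a hedged PromQL query to graph the per-second request rate of both HAProxy instances from these metrics:
rate(haproxy_process_requests_total{namespace="monitoring"}[5m])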
Alertmanager alerting configuration (to be done)
Only alerting for core cluster components will be implemented here; service-level alerting is handled by opsmind.
Grafana dashboard configuration (to be improved)
Only dashboards for core cluster components are added here; service dashboards are provided by http://grafana.aispeech.com.cn/.
1) kube-etcd and daoker-etcd: create a Grafana folder "Applications" and import https://grafana.com/grafana/dashboards/3070
2) apiserver-haproxy and ingress-haproxy: https://grafana.com/grafana/dashboards/10225