---
# VMRule manifests (VictoriaMetrics operator) defining alerting rule groups.
# One document per rule group: common, k8s, network, vmvare, wfm, xsf, uen.
#
# Conventions used by every rule in this file:
#   - annotations.description is a fixed human-readable text,
#   - annotations.summary renders the value of the alert expression,
#   - labels.severity is either "warning" or "emergency".
#
# metadata.resourceVersion is intentionally not set anywhere in this file: it
# is assigned by the API server, and a stale value copied from a live object
# can make updates fail with a conflict.

# Group "common": domain/certificate expiry, HTTP probes, host disks, load,
# CPU, system users and host availability.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-common
  namespace: victoria
spec:
  groups:
    - name: common
      rules:
        # (expiretime - now) is divided by 3600 and 24, i.e. expressed in days.
        - alert: 域名注册到期时间小于30天
          annotations:
            description: 域名注册到期时间小于30天
            summary: '{{ $value }}'
          expr: >-
            round(sum by(company, domain_name)((expiretime - time()) /3600 /
            24)) < 30
          for: 30m
          labels:
            severity: warning
        - alert: ssl证书到期时间小于7天
          annotations:
            description: ssl证书到期时间小于7天
            summary: '{{ $value }}'
          expr: >-
            round(sum
            by(env,job,service,url)((probe_ssl_earliest_cert_expiry-time())
            /3600/24)) < 7
          for: 1h
          labels:
            severity: warning
        # Status codes 200, 404 and 403 are treated as acceptable.
        - alert: 域名接口状态码返回异常
          annotations:
            description: 域名接口状态码返回异常
            summary: '{{ $value }}'
          expr: >-
            sum by (k8scluster, env, group, service, uri)
            (probe_http_status_code !=200 and probe_http_status_code !=404 and
            probe_http_status_code !=403)
          for: 15m
          labels:
            severity: warning
        - alert: 物理机磁盘状态异常
          annotations:
            description: 物理机磁盘状态异常
            summary: '{{ $value }}'
          expr: 'sum by(hostname,hostip,slot,Size)(node_disk_status) != 1'
          for: 1m
          labels:
            severity: warning
        - alert: 磁盘media错误过高
          annotations:
            description: 磁盘media错误过高
            summary: '{{ $value }}'
          expr: 'sum by(hostname,hostip,slot)(node_disk_media_error) > 10'
          for: 1m
          labels:
            severity: warning
        - alert: 磁盘other错误过高
          annotations:
            description: 磁盘other错误过高
            summary: '{{ $value }}'
          expr: 'sum by(hostname,hostip,slot)(node_disk_other_error) > 10'
          for: 1m
          labels:
            severity: warning
        - alert: 系统磁盘挂载点读写异常
          annotations:
            description: 系统磁盘挂载点读写异常
            summary: '{{ $value }}'
          expr: 'sum by(device,fs,mountpoint,hostname,hostip)(node_disk_mount) == 0'
          for: 3m
          labels:
            severity: warning
        - alert: 系统磁盘fstab卷丢失
          annotations:
            description: 系统磁盘fstab卷丢失
            summary: '{{ $value }}'
          expr: >-
            sum by(device,mountpoint,hostname,hostip)(node_disk_volume_loss) ==
            0
          for: 0m
          labels:
            severity: warning
        - alert: 系统磁盘IO负载过高
          annotations:
            description: 系统磁盘IO负载过高
            summary: '{{ $value }}'
          expr: 'round(sum by(hostname,hostip,device)(node_disk_ioutil{})) > 99'
          for: 5m
          labels:
            severity: warning
        # Load is normalised by node_cpu_core before comparing with 1.5.
        - alert: 系统load5过高
          annotations:
            description: 系统load5过高
            summary: '{{ $value }}'
          expr: >-
            round(sum by (hostname,hostip)(node_load5 / node_cpu_core),0.01)
            >1.5
          for: 3m
          labels:
            severity: warning
        - alert: 系统用户被修改
          annotations:
            description: 系统用户被修改
            summary: '{{ $value }}'
          expr: 'sum by (hostip,hostname,username)(delta(node_shadow_md5[5m])) != 0'
          for: 0m
          labels:
            severity: emergency
        - alert: 系统用户修改详情
          annotations:
            description: 系统用户修改详情
            summary: '{{ $value }}'
          expr: >-
            sum by (hostip,hostname,username,action)(node_systemuser_status) ==
            1
          for: 0m
          labels:
            severity: emergency
        - alert: 物理机离线
          annotations:
            description: 物理机离线
            summary: '{{ $value }}'
          expr: 'sum by(hostip,instance,k8scluster)(up{job=~"base-exporter-.*"}) == 0'
          for: 3m
          labels:
            severity: warning
        # NOTE(review): node_uptime / 60 is compared with 5, so the metric is
        # presumably in seconds - confirm against the exporter.
        - alert: 系统5分钟内发生重启
          annotations:
            description: 系统5分钟内发生重启(请结合系统离线告警判断)
            summary: '{{ $value }}'
          expr: 'round(sum by(hostname,hostip,k8scluster)(node_uptime) / 60) < 5'
          for: 1m
          labels:
            severity: emergency
        - alert: 物理机CPU使用率过高
          annotations:
            description: 物理机CPU使用率过高
            summary: '{{ $value }}'
          expr: 'round(sum by(hostname,hostip)(node_cpu_usage_total{})*100,0.01) > 80'
          for: 3m
          labels:
            severity: warning
        # 1099511627776 = 2^40. Volumes with node_disk_total above it alert at
        # 95% usage; all other volumes are covered by the next rule at 80%.
        - alert: 物理机大于1T的磁盘空间不足5%
          annotations:
            description: 物理机大于1T的磁盘空间不足5%
            summary: '{{ $value }}'
          expr: >-
            round(sum by(hostname,hostip,device,mountpoint)((node_disk_usage and
            on(hostname,device)
            (node_disk_total>1099511627776)))*100,0.01) >95
          for: 3m
          labels:
            severity: warning
        - alert: 物理机小于1T的磁盘空间不足20%
          annotations:
            description: 物理机小于1T的磁盘空间不足20%
            summary: '{{ $value }}'
          expr: >-
            round(sum by(hostname,hostip,device,mountpoint)((node_disk_usage and
            on(hostname,device)
            (node_disk_total<1099511627777)))*100,0.01) >80
          for: 3m
          labels:
            severity: warning
        - alert: 物理机磁盘根目录空间不足20%
          annotations:
            description: 物理机磁盘根目录空间不足20%
            summary: '{{ $value }}'
          expr: >-
            round(sum by (hostname, hostip, device, mountpoint)
            (node_disk_usage{mountpoint="/"})*100,0.01) > 80
          for: 3m
          labels:
            severity: warning
---
# Group "k8s": node conditions, PVC free space, pod restarts and StatefulSet
# replica mismatches.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-k8s
  namespace: victoria
spec:
  groups:
    - name: k8s
      rules:
        - alert: K8S集群Node节点NotReady
          annotations:
            description: K8S集群Node节点NotReady
            summary: '{{ $value }}'
          expr: >-
            sum by
            (k8scluster,node)(kube_node_status_condition{condition="Ready",status="true"})
            == 0
          for: 1m
          labels:
            severity: warning
        - alert: K8S集群资源短缺
          annotations:
            description: K8S集群资源短缺
            summary: '{{ $value }}'
          expr: >-
            sum
            by(condition,k8scluster,node,status)(kube_node_status_condition{condition=~"OutOfDisk|MemoryPressure|DiskPressure",status!="false"})
            ==1
          for: 1m
          labels:
            severity: warning
        - alert: K8S集群PVC空间不足20%
          annotations:
            description: PVC空间不足20%
            summary: '{{ $value }}'
          expr: >-
            round(sum by (namespace, persistentvolumeclaim, k8scluster)
            (kubelet_volume_stats_available_bytes /
            kubelet_volume_stats_capacity_bytes) * 100, 0.01) < 20
          for: 1m
          labels:
            severity: warning
        # Pods whose name matches kuboard-pv-browser.* are excluded.
        - alert: K8S集群15分钟内有Pod重启
          annotations:
            description: K8S集群15分钟内有Pod重启
            summary: '{{ $value }}'
          expr: >-
            sum by (container, k8scluster, namespace, pod)
            (delta(kube_pod_container_status_restarts_total{pod!~"kuboard-pv-browser.*"}[15m]))
            !=0
          for: 1m
          labels:
            severity: warning
        - alert: K8S集群Statefulset副本异常
          annotations:
            description: K8S集群Statefulset副本异常
            summary: '{{ $value }}'
          expr: >-
            sum
            by(k8scluster,namespace,statefulset)(kube_statefulset_status_replicas_ready
            != kube_statefulset_status_replicas)
          for: 1m
          labels:
            severity: warning
---
# Group "network": SNMP-based interface, power supply, fan, bandwidth and
# uptime alerts for network devices.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-network
  namespace: victoria
spec:
  groups:
    - name: network
      rules:
        # Only interfaces whose 1h average of ifOperStatus is below 2 are
        # considered; of those, the ones whose current status is not 1 alert.
        - alert: 网络设备接口异常DOWN掉
          annotations:
            description: 1表示UP,2表示DOWN
            summary: '{{ $value }}'
          expr: >-
            sum by (ifIndex, ifName, instance, project) (ifOperStatus and on
            (ifIndex, ifName, instance, project)
            avg_over_time(ifOperStatus[1h]) <2) != 1
          for: 0s
          labels:
            severity: warning
        - alert: 网络设备电源状态异常
          annotations:
            description: 网络设备电源状态异常
            summary: '{{ $value }}'
          expr: 'sum by(hh3cDevMPowerNum,instance,project)(hh3cDevMPowerStatus) !=1'
          for: 3m
          labels:
            severity: warning
        # The description previously repeated the power-supply text; it now
        # matches this fan alert.
        - alert: 网络设备风扇状态异常
          annotations:
            description: 网络设备风扇状态异常
            summary: '{{ $value }}'
          expr: 'sum by(hh3cDevMFanNum,instance,project)(hh3cDevMFanStatus) !=1'
          for: 3m
          labels:
            severity: warning
        # ifHC*Octets count octets (bytes), so the rate is multiplied by 8 and
        # divided by 10^6 to obtain Mbit/s, matching the 300Mbps threshold in
        # the alert name. The previous expression (/1024/1024 without * 8)
        # produced MiB/s and therefore only fired at roughly 2.5 Gbit/s.
        - alert: 网络设备流出带宽超过300Mbps
          annotations:
            description: 网络设备流出带宽超过300Mbps
            summary: '{{ $value }}'
          expr: >-
            round(sum by(ifIndex,ifName,instance)(irate(ifHCOutOctets[5m])
            * 8 / 1000 / 1000),0.01) >300
          for: 3m
          labels:
            severity: warning
        - alert: 网络设备流入带宽超过300Mbps
          annotations:
            description: 网络设备流入带宽超过300Mbps
            summary: '{{ $value }}'
          expr: >-
            round(sum by(ifIndex,ifName,instance)(irate(ifHCInOctets[5m])
            * 8 / 1000 / 1000),0.01) >300
          for: 3m
          labels:
            severity: warning
        # NOTE(review): sysUpTime / 100 / 60 is compared with 5, so the metric
        # is presumably in hundredths of a second - confirm.
        - alert: 网络设备5分钟内发生重启
          annotations:
            description: 网络设备5分钟内发生重启(请结合设备离线告警判断)
            summary: '{{ $value }}'
          expr: 'round(sum by (instance,project)(sysUpTime /100 /60)) <5'
          for: 1m
          labels:
            severity: warning
---
# Group "vmvare": host power state and host/datastore utilisation alerts
# built on vmware_* metrics.
# NOTE(review): "vmvare" and "EXSi" look like typos of "vmware" / "ESXi". The
# resource, group and alert names are kept unchanged because alert routing or
# silences may reference them.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-vmvare
  namespace: victoria
spec:
  groups:
    - name: vmvare
      rules:
        - alert: EXSi主机离线
          annotations:
            description: EXSi主机离线超过5分钟
            summary: '{{ $value }}'
          expr: 'sum by (dc_name,host_name)(vmware_host_power_state) !=1'
          for: 5m
          labels:
            severity: warning
        # The descriptions of the three usage rules below state the threshold
        # the expression actually applies (they previously all said 90%).
        - alert: EXSi主机CPU使用情况
          annotations:
            description: EXSi主机CPU使用率超95%
            summary: '{{ $value }}'
          expr: >-
            sum by (dc_name,host_name)(vmware_host_cpu_usage /
            vmware_host_cpu_max)*100 >95
          for: 5m
          labels:
            severity: warning
        - alert: EXSi主机内存使用情况
          annotations:
            description: EXSi主机内存使用率超99%
            summary: '{{ $value }}'
          expr: >-
            sum by (dc_name,host_name)(vmware_host_memory_usage/
            vmware_host_memory_max)*100 >99
          for: 5m
          labels:
            severity: warning
        - alert: EXSi主机磁盘容量情况
          annotations:
            description: EXSi主机磁盘容量使用率超99%
            summary: '{{ $value }}'
          expr: >-
            sum by
            (dc_name,ds_name)((vmware_datastore_capacity_size-
            vmware_datastore_freespace_size) /
            vmware_datastore_capacity_size)*100 >99
          for: 5m
          labels:
            severity: warning
---
# Group "wfm": MongoDB connection count plus per-pod CPU and memory usage
# relative to resource limits, for namespace "wfm" on cluster "d1-prod".
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-wfm
  namespace: victoria
spec:
  groups:
    - name: wfm
      rules:
        - alert: 微付猫MongoDB连接数小于10
          annotations:
            description: 微付猫MongoDB连接数小于10
            summary: '{{ $value }}'
          expr: >-
            sum
            by(env,state)(mongodb_connections{env="mongo-wfm-prod",state="current"})
            <10
          for: 10s
          labels:
            severity: warning
        # Per-pod CPU usage divided by the per-pod CPU limit, as a percentage.
        - alert: 微付猫podCPU使用率超过70%
          annotations:
            description: 微付猫podCPU使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{k8scluster="d1-prod",
            namespace="wfm"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="wfm", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)/sum(kube_pod_container_resource_limits{job="kube-state-metrics",
            k8scluster="d1-prod", namespace="wfm", resource="cpu"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="wfm", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)*100,0.01) >70
          for: 10s
          labels:
            severity: warning
        # Per-pod working-set memory divided by the per-pod memory limit, as a
        # percentage.
        - alert: 微付猫pod内存使用率超过70%
          annotations:
            description: 微付猫pod内存使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum(container_memory_working_set_bytes{k8scluster="d1-prod",
            namespace="wfm", container!="", image!=""}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="wfm", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)/sum(kube_pod_container_resource_limits{job="kube-state-metrics",
            k8scluster="d1-prod", namespace="wfm", resource="memory"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="wfm", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)*100,0.01) >70
          for: 10s
          labels:
            severity: warning
---
# Group "xsf": host availability/CPU/disk plus pod readiness, restarts and
# CPU/memory usage for namespace "xsf" on cluster "d3-prod".
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-xsf
  namespace: victoria
spec:
  groups:
    - name: xsf
      rules:
        - alert: 新闪付机器离线
          annotations:
            description: 新闪付机器离线
            summary: '{{ $value }}'
          expr: >-
            sum
            by(hostip,instance)(up{k8scluster="d3-prod",job=~"base-exporter-.*"})
            != 1
          for: 3m
          labels:
            severity: warning
        # NOTE(review): this rule reads node_cpu_usage while the "common"
        # group reads node_cpu_usage_total - confirm both metrics exist.
        - alert: 新闪付机器CPU使用率超过70%
          annotations:
            description: 新闪付机器CPU使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum
            by(hostname,hostip)(node_cpu_usage{k8scluster="d3-prod",job=~"base-exporter-.*"})*100,0.01)
            > 70
          for: 3m
          labels:
            severity: warning
        - alert: 新闪付机器磁盘空间不足20%
          annotations:
            description: 新闪付机器磁盘空间不足20%
            summary: '{{ $value }}'
          expr: >-
            round(sum by (hostname, hostip, device,
            mountpoint)(node_disk_usage{k8scluster="d3-prod",job=~"base-exporter-.*"}
            * 100),0.01) > 80
          for: 30s
          labels:
            severity: warning
        # The phase series is compared with 1 so that only pods currently in
        # phase Running are kept. kube-state-metrics exposes a 0/1 sample per
        # phase, so without "== 1" the "and" matched every pod regardless of
        # its phase.
        - alert: 新闪付机服务Pod运行异常
          annotations:
            description: 新闪付机服务Pod运行异常
            summary: '{{ $value }}'
          expr: >-
            (sum
            by(pod)(kube_pod_status_ready{job="kube-state-metrics",k8scluster="d3-prod",namespace="xsf",condition="true"}
            and on (namespace,pod) (kube_pod_status_phase{phase="Running"} == 1)))
            == 0
          for: 1m
          labels:
            severity: warning
        - alert: 新闪付机服务Pod异常重启
          annotations:
            description: 新闪付机服务Pod异常重启
            summary: '{{ $value }}'
          expr: >-
            sum
            by(namespace,pod)(round(delta(kube_pod_container_status_restarts_total{job="kube-state-metrics",k8scluster="d3-prod",namespace=~"xsf"}[10m])))
            != 0
          for: 1m
          labels:
            severity: warning
        # Per-pod CPU usage divided by the per-pod CPU limit, as a percentage.
        - alert: 新闪付podCPU使用率超过70%
          annotations:
            description: 新闪付podCPU使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{k8scluster="d3-prod",
            namespace="xsf"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d3-prod",
            namespace="xsf", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)/sum(kube_pod_container_resource_limits{job="kube-state-metrics",
            k8scluster="d3-prod", namespace="xsf", resource="cpu"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d3-prod",
            namespace="xsf", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)*100,0.01) >70
          for: 10s
          labels:
            severity: warning
        # Per-pod working-set memory divided by the per-pod memory limit, as a
        # percentage.
        - alert: 新闪付pod内存使用率超过70%
          annotations:
            description: 新闪付pod内存使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum(container_memory_working_set_bytes{k8scluster="d3-prod",
            namespace="xsf", container!="", image!=""}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d3-prod",
            namespace="xsf", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)/sum(kube_pod_container_resource_limits{job="kube-state-metrics",
            k8scluster="d3-prod", namespace="xsf", resource="memory"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d3-prod",
            namespace="xsf", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)*100,0.01) >70
          for: 10s
          labels:
            severity: warning
---
# Group "uen": per-pod CPU and memory usage relative to resource limits, for
# namespace "uen" on cluster "d1-prod".
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: alertmanager-uen
  namespace: victoria
spec:
  groups:
    - name: uen
      rules:
        # Per-pod CPU usage divided by the per-pod CPU limit, as a percentage.
        - alert: 尤恩podCPU使用率超过70%
          annotations:
            description: 尤恩podCPU使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{k8scluster="d1-prod",
            namespace="uen"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="uen", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)/sum(kube_pod_container_resource_limits{job="kube-state-metrics",
            k8scluster="d1-prod", namespace="uen", resource="cpu"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="uen", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)*100,0.01) >70
          for: 10s
          labels:
            severity: warning
        # Per-pod working-set memory divided by the per-pod memory limit, as a
        # percentage.
        - alert: 尤恩pod内存使用率超过70%
          annotations:
            description: 尤恩pod内存使用率超过70%
            summary: '{{ $value }}'
          expr: >-
            round(sum(container_memory_working_set_bytes{k8scluster="d1-prod",
            namespace="uen", container!="", image!=""}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="uen", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)/sum(kube_pod_container_resource_limits{job="kube-state-metrics",
            k8scluster="d1-prod", namespace="uen", resource="memory"}
            * on(namespace,pod) group_left(workload, workload_type)
            namespace_workload_pod:kube_pod_owner:relabel{k8scluster="d1-prod",
            namespace="uen", workload=~".*", workload_type=~".*"})
            by (k8scluster,namespace,pod)*100,0.01) >70
          for: 10s
          labels:
            severity: warning