#!/bin/bash
# Host inventory / health collector.
#
# Prints one "metric_name{labels} value" sample per line on stdout.
# NOTE(review): the output looks like Prometheus text exposition format,
# presumably consumed by a textfile collector -- confirm with the deployment.
# NOTE(review): reads /etc/shadow and calls dmidecode, so it presumably has to
# run as root -- confirm.
#
# Side effects visible in this file:
#   * may install packages (yum / rpm) when a required tool is missing;
#   * keeps state between runs in /var/base (shadow checksum and user lists);
#   * creates and removes ".disk.tmp" in every checked mount point.
# Diagnostics and installer output go to stderr so stdout carries samples only.

#######################################
# Generic unit conversion: turn a size string such as "16GB", "558.375gb" or
# "1.090TB" into a byte count (1 KB = 1024 B).
# Arguments: $1 - number immediately followed by a unit (B, KB, MB, GB, TB;
#                 case-insensitive)
# Outputs:   the byte count on stdout, rounded with "%.0f"; for plain "B" the
#            number is echoed unchanged
# Returns:   0 on success, 1 (message on stderr) when the number is missing or
#            the unit is unknown
#######################################
convert_to_bytes() {
  local value="$1"
  local number unit factor

  # Only the first number / first run of letters is considered.
  number=$(grep -o -E '[0-9]+(\.[0-9]+)?' <<<"$value" | head -n 1)
  unit=$(grep -o -E '[a-zA-Z]+' <<<"$value" | head -n 1 | tr '[:lower:]' '[:upper:]')

  if [[ -z "$number" ]]; then
    echo "Invalid size: $value" >&2
    return 1
  fi

  case "$unit" in
    B)
      printf '%s\n' "$number"
      return 0
      ;;
    KB) factor=1024 ;;
    MB) factor=$((1024 * 1024)) ;;
    GB) factor=$((1024 * 1024 * 1024)) ;;
    TB) factor=$((1024 * 1024 * 1024 * 1024)) ;;
    *)
      # stderr + return (not stdout + exit): callers capture stdout as the
      # sample value, and exit would terminate the script if the function were
      # ever called outside a command substitution.
      echo "Invalid unit: $unit" >&2
      return 1
      ;;
  esac

  # Values are handed to awk with -v instead of being spliced into the program.
  awk -v n="$number" -v f="$factor" 'BEGIN {printf "%.0f", n * f}'
}

# ---------------------------------------------------------------------------
# Machine model and serial number
# ---------------------------------------------------------------------------
system_info=$(dmidecode -t 1)
# The arrays are word-split on purpose; only the first word of each is used.
model=($(grep -E "Manufacturer|Product Name" <<<"$system_info" \
  | awk -F"[: ]+" 'NR==1 {first=$2"_"$3; next}{print first"_"$3"_"$4}'))
sn=($(grep -E "Serial Number" <<<"$system_info" | awk -F"[: ]+" '{print $3}'))
echo "node_machine_info{model=\"${model[0]}\",sn=\"${sn[0]}\"}" 1

# ---------------------------------------------------------------------------
# Egress (public) IP; the sample value is the address with the dots removed
# ---------------------------------------------------------------------------
echo $(timeout 2 curl -s ifconfig.me | awk '{ip=$1; gsub(/\./, "", ip); print "node_snat_ip{ip=\"" $1 "\"} " ip}')

# ---------------------------------------------------------------------------
# Physical memory: size per slot
# ---------------------------------------------------------------------------
dimm_info=$(dmidecode -t 17)
locators=($(grep "Locator:" <<<"$dimm_info" | grep -v Bank | awk '{print $2$3}'))
sizes=($(grep "Size:" <<<"$dimm_info" \
  | grep -Ev "Non-Volatile|Volatile|Cache|Logical" | awk '{print $2$3}'))
for i in "${!locators[@]}"; do
  locator=${locators[$i]}
  size=${sizes[$i]}
  # Entries whose size has no "B" in it carry no byte unit and are skipped.
  if [[ $size == *"B"* ]]; then
    # Emit nothing rather than a sample with a non-numeric value.
    if bytes=$(convert_to_bytes "$size"); then
      echo "node_mem_info{locator=\"$locator\"} $bytes"
    fi
  fi
done

# ---------------------------------------------------------------------------
# Mount points: can a file be created and removed? (1 = yes, 0 = no/timeout)
# ---------------------------------------------------------------------------
awk '$3 ~ /^(ext2|ext3|ext4|xfs|nfs|cifs|ossfs|glusterfs)$/' /proc/mounts \
  | grep -E -v "kubelet|run|tmp|nfsd|rpc_pipefs|overlay2|devicemapper" \
  | while read -r device mountpoint fs _; do
    # The mount point is passed as $1 so it is never parsed as shell code.
    if timeout 5 sh -c 'touch "$1/.disk.tmp" && rm -f "$1/.disk.tmp"' sh "$mountpoint" &>/dev/null; then
      status=1
    else
      status=0
    fi
    echo "node_disk_mount{device=\"$device\",fstype=\"$fs\",mountpoint=\"$mountpoint\"} $status"
  done

# ---------------------------------------------------------------------------
# Mount points listed in /etc/fstab but absent from /proc/mounts
# ---------------------------------------------------------------------------
# "none" is excluded together with "swap": swap entries may use either word in
# the mount-point column and never show up in /proc/mounts.
awk '$2 != "/" && $2 != "swap" && $2 != "none" && $2 != "" && !/^#/ {print $2}' /etc/fstab \
  | while read -r mountpoint; do
    # Exact comparison on the mount-point field; a substring/regex match would
    # let "/data" be satisfied by "/data2". ENVIRON is used instead of -v so
    # backslash sequences in the path are not rewritten by awk.
    if ! mp="$mountpoint" awk '$2 == ENVIRON["mp"] {found = 1; exit} END {exit !found}' /proc/mounts; then
      mp="$mountpoint" awk '!/^#/ && $2 == ENVIRON["mp"] {print "node_disk_volume_loss{device=\"" $1 "\",mountpoint=\"" $2 "\"} 0"}' /etc/fstab
    fi
  done

# ---------------------------------------------------------------------------
# Physical disks (MegaCli)
# ---------------------------------------------------------------------------
command -v lspci >/dev/null 2>&1 || yum -y install pciutils >&2

megacli=/opt/MegaRAID/MegaCli/MegaCli64
if [ -f "$megacli" ]; then
  # Query the controller once so every array describes the same snapshot.
  pd_list=$("$megacli" -PDList -aALL -NoLog)
  slot_numbers=($(awk '/Slot Number/ {print $3}' <<<"$pd_list"))
  raw_sizes=($(awk '/Raw/ {print $3 $4}' <<<"$pd_list"))
  firmware_states=($(awk -F"[ ,]+" '/Firmware state/ {print $3}' <<<"$pd_list"))
  media_errors=($(awk '/Media Error Count/ {print $4}' <<<"$pd_list"))
  other_errors=($(awk '/Other Error Count/ {print $4}' <<<"$pd_list"))
  predictive_failures=($(awk '/Predictive Failure Count/ {print $4}' <<<"$pd_list"))

  for i in "${!slot_numbers[@]}"; do
    slot=${slot_numbers[$i]}
    size=${raw_sizes[$i]}
    state=${firmware_states[$i]}
    media_error=${media_errors[$i]}
    other_error=${other_errors[$i]}
    predictive_failure=${predictive_failures[$i]}

    # These four firmware states are reported as healthy (1); anything else 0.
    case "$state" in
      Online | JBOD | Hotspare | Rebuild) status=1 ;;
      *) status=0 ;;
    esac

    echo "node_disk_status{slot=\"$slot\",size=\"$size\"} $status"
    echo "node_disk_media_error{slot=\"$slot\",size=\"$size\"} $media_error"
    echo "node_disk_other_error{slot=\"$slot\",size=\"$size\"} $other_error"
    echo "node_disk_predictive_failure{slot=\"$slot\",size=\"$size\"} $predictive_failure"
  done
else
  rpm -ivh https://zhengyu1992.cn/file/software/DellRaid/MegaCli-8.07.14-1.noarch.rpm >&2
fi

# ---------------------------------------------------------------------------
# RAID groups (perccli, controller 0)
# ---------------------------------------------------------------------------
perccli=/opt/MegaRAID/perccli/perccli64
if [ -f "$perccli" ]; then
  vd_info=$("$perccli" /c0/vall show all)
  raid_names=($(awk -F"[/: ]+" '/\/c0/ {print $2$3}' <<<"$vd_info"))
  raid_types=($(awk '/RAID/ {print $2}' <<<"$vd_info"))
  raid_sizes=($(awk '/RAID/ {print $9$10}' <<<"$vd_info"))
  raid_naaids=($(awk '/SCSI NAA Id/ {print $5}' <<<"$vd_info"))

  for i in "${!raid_names[@]}"; do
    # Names look like "c0v<N>"; keep what follows "c0v".
    vdnumber=$(awk -F"c0v" '{print $2}' <<<"${raid_names[$i]}")
    # Second line after each "DG/VD" header is the "<dg>/<vd> ..." data row.
    dgnumber=$(sed -n '/DG\/VD/{n; n; p;}' <<<"$vd_info" \
      | awk -v num="$vdnumber" -F"[/ ]+" '$2 == num {print $1}')
    # Rows whose first column is "<enclosure>:<slot>" and whose 4th column is
    # the drive group: keep the slot part. Splitting on ":" is correct for any
    # enclosure id width (stripping two characters mangled "128:<slot>").
    slot=$(awk -v num="$dgnumber" \
      '$1 ~ /^(32:|64:|128:)/ && $4 == num {split($1, eid_slt, ":"); print eid_slt[2]}' \
      <<<"$vd_info" | paste -sd '+' -)
    type=${raid_types[$i]}
    naaid=${raid_naaids[$i]}
    # Skip the sample when the size cannot be converted to a number.
    size=$(convert_to_bytes "${raid_sizes[$i]}") || continue
    echo "node_raid_info{number=\"virtualdrive$vdnumber\",slot=\"slot$slot\",type=\"$type\",serial=\"$naaid\"} $size"
  done
else
  rpm -ivh https://zhengyu1992.cn/file/software/DellRaid/perccli-007.0127.0000.0000-1.noarch.rpm >&2
fi

# ---------------------------------------------------------------------------
# Process count and TCP connections per state
# ---------------------------------------------------------------------------
# The count is the number of lines printed by "ps aux", header line included.
process_num=$(ps aux | wc -l)
echo "node_process_num{} $process_num"

command -v netstat >/dev/null 2>&1 || yum -y install iproute net-tools >&2
tcp_states=$(netstat -nt | awk 'NR>2{print $NF}' | sort -u)

#######################################
# Print the number of TCP connections currently in one state.
# Arguments: $1 - state name as shown in the last column of "netstat -nt"
# Outputs:   one node_tcp_count sample on stdout
#######################################
print_tcp_connection_count() {
  local status="$1"
  local count
  # "+ 0" yields 0 instead of an empty value when the state has disappeared
  # since the list of states was taken.
  count=$(netstat -nt | awk -v state="$status" '$NF == state {count++} END {print count + 0}')
  echo "node_tcp_count{status=\"$status\"} $count"
}

while read -r state; do
  [[ -n "$state" ]] || continue
  print_tcp_connection_count "$state"
done <<<"$tcp_states"

# ---------------------------------------------------------------------------
# SSH authorized_keys fingerprints
# ---------------------------------------------------------------------------
#######################################
# Print a numeric fingerprint of a file: its md5 hex digest with all letters
# removed, or "0" when the file does not exist.
# Arguments: $1 - path of the file
# Outputs:   the fingerprint on stdout
#######################################
calculate_md5() {
  local filepath="$1"
  if [ -f "$filepath" ]; then
    md5sum "$filepath" | cut -d" " -f1 | tr -d "a-zA-Z"
  else
    echo "0"
  fi
}

md5root=$(calculate_md5 "/root/.ssh/authorized_keys")
echo "node_authorized_keys{username=\"root\"} $md5root"

# Accounts with uid 1000..10000 whose shell is not /sbin/nologin.
# NOTE(review): the home directory is assumed to be /home/<user>; accounts
# with a different home in /etc/passwd are looked up in the wrong place.
awk -F":" '$3>=1000 && $3<=10000 && $NF!~"/sbin/nologin" {print $1}' /etc/passwd \
  | while read -r user; do
    md5user=$(calculate_md5 "/home/$user/.ssh/authorized_keys")
    echo "node_authorized_keys{username=\"$user\"} $md5user"
  done

# ---------------------------------------------------------------------------
# Account changes: users added / deleted, passwords changed
# ---------------------------------------------------------------------------
state_dir=/var/base

#######################################
# Save the account names and the "name hash" pairs currently in /etc/shadow.
# Globals:   state_dir (read)
# Arguments: $1 - file-name suffix, "old" or "new"
# Outputs:   writes $state_dir/.userlist<suffix> and .passwdlist<suffix>
#######################################
snapshot_shadow() {
  local suffix="$1"
  awk -F":" '{print $1}' /etc/shadow >"$state_dir/.userlist$suffix"
  awk -F":" '{if($2!="*"&&($2!="!!"))print $1" "$2}' /etc/shadow >"$state_dir/.passwdlist$suffix"
}

# Subshell: the restrictive umask must not leak into the rest of the script.
(
  # The state files contain password hashes; keep them private, including
  # files left behind by earlier runs.
  umask 077
  mkdir -p "$state_dir"
  chmod 600 "$state_dir"/.passwd* 2>/dev/null

  md5shadow=$(md5sum </etc/shadow | cut -d" " -f1 | tr -d "a-zA-Z")

  if [ ! -f "$state_dir/.md5shadow" ] || [ ! -f "$state_dir/.userlistold" ] \
    || [ ! -f "$state_dir/.passwdlistold" ]; then
    # First run (or incomplete state): record a baseline, report nothing.
    echo "$md5shadow" >"$state_dir/.md5shadow"
    snapshot_shadow old
  else
    echo "node_shadow_md5{} $md5shadow"
    if [[ "$md5shadow" != "$(<"$state_dir/.md5shadow")" ]]; then
      snapshot_shadow new

      # tmp = entries present in both snapshots; whatever is left in "new"
      # was added (or changed), whatever is left in "old" was removed.
      sort "$state_dir/.userlistnew" "$state_dir/.userlistold" | uniq -d >"$state_dir/.userlisttmp"
      sort "$state_dir/.userlistnew" "$state_dir/.userlisttmp" | uniq -u >"$state_dir/.useradd"
      sort "$state_dir/.userlistold" "$state_dir/.userlisttmp" | uniq -u >"$state_dir/.userdel"
      sort "$state_dir/.passwdlistnew" "$state_dir/.passwdlistold" | uniq -d >"$state_dir/.passwdlisttmp"
      sort "$state_dir/.passwdlistnew" "$state_dir/.passwdlisttmp" | uniq -u >"$state_dir/.passwdchange"

      while read -r adduser; do
        [[ -n "$adduser" ]] || continue
        echo "node_systemuser_status{username=\"$adduser\",action=\"adduser\"} 1"
      done <"$state_dir/.useradd"

      while read -r deluser; do
        [[ -n "$deluser" ]] || continue
        echo "node_systemuser_status{username=\"$deluser\",action=\"deleteuser\"} 1"
      done <"$state_dir/.userdel"

      # Each line is "name hash"; only the name is reported.
      while read -r changeuser _; do
        [[ -n "$changeuser" ]] || continue
        echo "node_systemuser_status{username=\"$changeuser\",action=\"changepasswd\"} 1"
      done <"$state_dir/.passwdchange"
    fi
    # The current state becomes the baseline for the next run.
    echo "$md5shadow" >"$state_dir/.md5shadow"
    snapshot_shadow old
  fi
)

# ---------------------------------------------------------------------------
# Container runtime and the Kubernetes containers it runs
# ---------------------------------------------------------------------------
if command -v docker >/dev/null 2>&1; then dockerstatus=1; else dockerstatus=0; fi
if command -v ctr >/dev/null 2>&1; then ctrstatus=1; else ctrstatus=0; fi

if [[ "$dockerstatus" == "1" ]]; then
  echo "node_container_runtime{runtime=\"docker\"} $dockerstatus"
  # Container names containing "k8s" are split on "_" into labels.
  timeout 5 docker ps | grep -Ev "pause|CONTAINER" \
    | awk '{if($NF~/k8s/)print $NF}' \
    | awk -F"_" '{print "node_k8s_service{servicename=\""$2"\",podname=\""$3"\",namespace=\""$4"\"} 1"}'
elif [[ "$ctrstatus" == "1" ]]; then
  echo "node_container_runtime{runtime=\"containerd\"} $ctrstatus"
  timeout 5 crictl ps | grep -v "CONTAINER" \
    | awk '{print "node_k8s_service{servicename=\""$7"\",podname=\""$10"\"} 1"}'
else
  echo "node_container_runtime{runtime=\"null\"} 0"
fi

# ---------------------------------------------------------------------------
# GPU
# ---------------------------------------------------------------------------
# Nothing is reported when nvidia-smi is missing or fails. (There is no
# enclosing loop here, so "break" must not be used to skip the section.)
if nvidia-smi &>/dev/null; then
  # Lines of the "Processes" table (at most 10 after its title) minus 4 lines
  # that are not process rows.
  processnum=$(($(nvidia-smi | grep -A 10 Processes | wc -l) - 4))
  if [[ ${processnum} -gt 0 ]]; then
    echo "node_gpu_process_num{} $processnum"
    # gpu-monitor.py is expected next to this script.
    basedir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    python2 "$basedir/gpu-monitor.py"
  fi
fi