#!/usr/bin/env bash

# Copyright 2021 The Rook Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -xeEo pipefail

#############
# VARIABLES #
#############

function find_extra_block_dev() {
  # shellcheck disable=SC2005 # redirect doesn't work with sudo, so use echo
  echo "$(sudo lsblk)" >/dev/stderr # print lsblk output to stderr for debugging in case of future errors
  # relevant lsblk --pairs example: (MOUNTPOINT identifies boot partition)(PKNAME is Parent dev ID)
  # NAME="sda15" SIZE="106M" TYPE="part" MOUNTPOINT="/boot/efi" PKNAME="sda"
  # NAME="sdb"   SIZE="75G"  TYPE="disk" MOUNTPOINT=""          PKNAME=""
  # NAME="sdb1"  SIZE="75G"  TYPE="part" MOUNTPOINT="/mnt"      PKNAME="sdb"
  boot_dev="$(sudo lsblk --noheading --list --output MOUNTPOINT,PKNAME | grep boot | awk '{print $2}')"
  echo " == find_extra_block_dev(): boot_dev='$boot_dev'" >/dev/stderr # debug in case of future errors
  # --nodeps ignores partitions
  extra_dev="$(sudo lsblk --noheading --list --nodeps --output KNAME | grep -v loop | grep -v "$boot_dev" | head -1)"
  echo " == find_extra_block_dev(): extra_dev='$extra_dev'" >/dev/stderr # debug in case of future errors
  echo "$extra_dev" # output of function
}

: "${BLOCK:=$(find_extra_block_dev)}"

# by definition, in this file, BLOCK should only contain the "sdX" portion of the block device name
# some external scripts export BLOCK as the full "/dev/sdX" path, which this script must handle
BLOCK="$(basename "$BLOCK")"

NETWORK_ERROR="connection reset by peer"
SERVICE_UNAVAILABLE_ERROR="Service Unavailable"
INTERNAL_ERROR="INTERNAL_ERROR"
INTERNAL_SERVER_ERROR="500 Internal Server Error"

#############
# FUNCTIONS #
#############

function install_deps() {
  sudo wget https://github.com/mikefarah/yq/releases/download/3.4.1/yq_linux_amd64 -O /usr/local/bin/yq
  sudo chmod +x /usr/local/bin/yq
}

function print_k8s_cluster_status() {
  kubectl cluster-info
  kubectl get pods -n kube-system
}

function prepare_loop_devices() {
  if [ $# -ne 1 ]; then
    echo "usage: $0 loop_device_count"
    exit 1
  fi
  OSD_COUNT=$1
  if [ "$OSD_COUNT" -le 0 ]; then
    echo "Invalid OSD_COUNT $OSD_COUNT. OSD_COUNT must be larger than 0."
    exit 1
  fi
  for i in $(seq 1 "$OSD_COUNT"); do
    # seek=6144 with count=0 creates a sparse 6GiB backing file for each loop device
    sudo dd if=/dev/zero of=~/data${i}.img bs=1M seek=6144 count=0
    sudo losetup /dev/loop${i} ~/data${i}.img
  done
  sudo lsblk
}

function use_local_disk() {
  BLOCK_DATA_PART="/dev/${BLOCK}1"
  sudo apt purge snapd -y
  sudo dmsetup version || true
  sudo swapoff --all --verbose
  if mountpoint -q /mnt; then
    sudo umount /mnt
    # search for the device since it keeps changing between sda and sdb
    sudo wipefs --all --force "$BLOCK_DATA_PART"
  else # it's the hosted runner!
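    # On a hosted runner there is no /mnt partition to clean up, so the whole
    # device is wiped instead: sgdisk destroys the GPT/MBR structures, dd zeroes
    # the first 10MiB to clear leftover filesystem signatures, and parted lays
    # down a fresh GPT label. A non-destructive way to confirm nothing survived
    # the wipe (hypothetical check, not part of the CI flow):
    #   sudo wipefs --no-act "/dev/${BLOCK}"   # dry run: lists any remaining signatures
    #   sudo blkid "/dev/${BLOCK}" || echo "no signatures found"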
    sudo sgdisk --zap-all -- "/dev/${BLOCK}"
    sudo dd if=/dev/zero of="/dev/${BLOCK}" bs=1M count=10 oflag=direct,dsync
    sudo parted -s "/dev/${BLOCK}" mklabel gpt
  fi
  sudo lsblk
}

function use_local_disk_for_integration_test() {
  sudo apt purge snapd -y
  sudo udevadm control --log-priority=debug
  sudo swapoff --all --verbose
  sudo umount /mnt
  sudo sed -i.bak '/\/mnt/d' /etc/fstab
  # search for the device since it keeps changing between sda and sdb
  PARTITION="/dev/${BLOCK}1"
  sudo wipefs --all --force "$PARTITION"
  sudo dd if=/dev/zero of="${PARTITION}" bs=1M count=1
  sudo lsblk --bytes
  # add a udev rule to force the disk partitions to ceph
  # we have observed that some runners keep detaching/re-attaching the additional disk,
  # overriding the permissions to the default root:disk
  # for more details see: https://github.com/rook/rook/issues/7405
  echo "SUBSYSTEM==\"block\", ATTR{size}==\"29356032\", ACTION==\"add\", RUN+=\"/bin/chown 167:167 $PARTITION\"" | sudo tee -a /etc/udev/rules.d/01-rook.rules
  # for below, see: https://access.redhat.com/solutions/1465913
  echo "ACTION==\"add|change\", KERNEL==\"${BLOCK}\", OPTIONS:=\"nowatch\"" | sudo tee -a /etc/udev/rules.d/99-z-rook-nowatch.rules
  # The partition is still getting reloaded occasionally during operation. See https://github.com/rook/rook/issues/8975
  # Try issuing some disk-inspection commands to jog the system so it won't reload the partitions
  # during OSD provisioning.
  sudo udevadm control --reload-rules || true
  sudo udevadm trigger || true
  time sudo udevadm settle || true
  sudo partprobe || true
  sudo lsblk --noheadings --pairs "/dev/${BLOCK}" || true
  sudo sgdisk --print "/dev/${BLOCK}" || true
  sudo udevadm info --query=property "/dev/${BLOCK}" || true
  sudo lsblk --noheadings --pairs "${PARTITION}" || true
  journalctl -o short-precise --dmesg | tail -40 || true
  cat /etc/fstab || true
}

function create_partitions_for_osds() {
  tests/scripts/create-bluestore-partitions.sh --disk "/dev/$BLOCK" --osd-count 2
  sudo lsblk
}

function create_bluestore_partitions_and_pvcs() {
  BLOCK_PART="/dev/${BLOCK}2"
  DB_PART="/dev/${BLOCK}1"
  tests/scripts/create-bluestore-partitions.sh --disk "/dev/$BLOCK" --bluestore-type block.db --osd-count 1
  tests/scripts/localPathPV.sh "$BLOCK_PART" "$DB_PART"
}

function create_bluestore_partitions_and_pvcs_for_wal() {
  BLOCK_PART="/dev/${BLOCK}3"
  DB_PART="/dev/${BLOCK}1"
  WAL_PART="/dev/${BLOCK}2"
  tests/scripts/create-bluestore-partitions.sh --disk "/dev/$BLOCK" --bluestore-type block.wal --osd-count 1
  tests/scripts/localPathPV.sh "$BLOCK_PART" "$DB_PART" "$WAL_PART"
}

function collect_udev_logs_in_background() {
  local log_dir="${1:-"/home/runner/work/rook/rook/tests/integration/_output/tests"}"
  mkdir -p "${log_dir}"
  udevadm monitor --property &>"${log_dir}"/udev-monitor-property.txt &
  udevadm monitor --kernel &>"${log_dir}"/udev-monitor-kernel.txt &
  udevadm monitor --udev &>"${log_dir}"/udev-monitor-udev.txt &
}

function check_empty_file() {
  output_file=$1
  if [ -s "$output_file" ]; then
    echo "script failed with stderr error"
    cat "$output_file"
    rm -f "$output_file"
    exit 1
  fi
}

function build_rook() {
  build_type=build
  if [ -n "$1" ]; then
    build_type=$1
  fi
  GOPATH=$(go env GOPATH) make clean
  for _ in $(seq 1 3); do
    if ! o=$(make -j"$(nproc)" IMAGES='ceph' "$build_type"); then
      case "$o" in
      *"$NETWORK_ERROR"*)
        echo "network failure occurred, retrying..."
        continue
        ;;
      *"$SERVICE_UNAVAILABLE_ERROR"*)
        echo "network failure occurred, retrying..."
        continue
        ;;
      *"$INTERNAL_ERROR"*)
        echo "network failure occurred, retrying..."
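        # like the other transient-error branches, this falls through to
        # `continue`, re-running `make` on the next pass of the 3-attempt retry
        # loop above; only unrecognized output is treated as a real failure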
        continue
        ;;
      *"$INTERNAL_SERVER_ERROR"*)
        echo "network failure occurred, retrying..."
        continue
        ;;
      *) # valid failure
        echo "failed with the following log:"
        echo "$o"
        exit 1
        ;;
      esac
    fi
    # no errors so we break the loop after the first iteration
    break
  done
  # validate build
  tests/scripts/validate_modified_files.sh build
  docker images
  if [[ "$build_type" == "build" ]]; then
    docker tag "$(docker images | awk '/build-/ {print $1}')" rook/ceph:local-build
  fi
}

function build_rook_all() {
  build_rook build.all
}

function validate_yaml() {
  cd deploy/examples
  kubectl create -f crds.yaml -f common.yaml -f csi/nfs/rbac.yaml

  # create the volume replication CRDs
  replication_version=v0.3.0
  replication_url="https://raw.githubusercontent.com/csi-addons/volume-replication-operator/${replication_version}/config/crd/bases"
  kubectl create -f "${replication_url}/replication.storage.openshift.io_volumereplications.yaml"
  kubectl create -f "${replication_url}/replication.storage.openshift.io_volumereplicationclasses.yaml"

  # create the KEDA CRDs
  keda_version=2.4.0
  keda_url="https://github.com/kedacore/keda/releases/download/v${keda_version}/keda-${keda_version}.yaml"
  kubectl apply -f "${keda_url}"

  # create the COSI CRDs
  cosi_crd_url="github.com/kubernetes-sigs/container-object-storage-interface-api"
  kubectl create -k "${cosi_crd_url}"

  # skip folders and some yamls that are only for openshift
  manifests="$(find . -maxdepth 1 -type f -name '*.yaml' -and -not -name '*openshift*' -and -not -name 'scc*' -and -not -name 'psp*' -and -not -name 'kustomization*')"
  with_f_arg="$(echo "$manifests" | awk '{printf " -f %s",$1}')" # don't add newline
  # shellcheck disable=SC2086 # '-f manifest1.yaml -f manifest2.yaml etc.' should not be quoted
  kubectl create ${with_f_arg} --dry-run=client
}

function create_cluster_prerequisites() {
  # this might be called from another function that has already done a cd
  (cd deploy/examples && kubectl create -f crds.yaml -f common.yaml -f csi/nfs/rbac.yaml)
}

function deploy_manifest_with_local_build() {
  sed -i 's/.*ROOK_CSI_ENABLE_NFS:.*/  ROOK_CSI_ENABLE_NFS: "true"/g' "$1"
  if [[ "$USE_LOCAL_BUILD" != "false" ]]; then
    sed -i "s|image: rook/ceph:.*|image: rook/ceph:local-build|g" "$1"
  fi
  if [[ "$ALLOW_LOOP_DEVICES" = "true" ]]; then
    sed -i "s|ROOK_CEPH_ALLOW_LOOP_DEVICES: \"false\"|ROOK_CEPH_ALLOW_LOOP_DEVICES: \"true\"|g" "$1"
  fi
  sed -i "s|ROOK_LOG_LEVEL:.*|ROOK_LOG_LEVEL: DEBUG|g" "$1"
  kubectl create -f "$1"
}

# Deploy toolbox with the same ceph version as the cluster-test for CI
function deploy_toolbox() {
  sed -i 's/image: quay\.io\/ceph\/ceph:.*/image: quay.io\/ceph\/ceph:v18/' toolbox.yaml
  kubectl create -f toolbox.yaml
}

function replace_ceph_image() {
  local file="$1"           # parameter 1: the file in which to replace the ceph image
  local ceph_image="${2:-}" # parameter 2: the new ceph image to use
  if [[ -z ${ceph_image} ]]; then
    echo "No Ceph image given. Not adjusting manifests."
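    # an empty ceph_image means "keep the image already in the manifest", so
    # this is a successful no-op rather than an error. Example (hypothetical)
    # usage, with an image tag chosen for illustration:
    #   replace_ceph_image cluster-test.yaml quay.io/ceph/ceph:v18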
    return 0
  fi
  sed -i "s|image: .*ceph/ceph:.*|image: ${ceph_image}|g" "${file}"
}

function deploy_cluster() {
  cd deploy/examples
  deploy_manifest_with_local_build operator.yaml

  if [ $# == 0 ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}|g" cluster-test.yaml
  elif [ "$1" = "two_osds_in_device" ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}\n    config:\n      osdsPerDevice: \"2\"|g" cluster-test.yaml
  elif [ "$1" = "osd_with_metadata_device" ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}\n    config:\n      metadataDevice: /dev/test-rook-vg/test-rook-lv|g" cluster-test.yaml
  elif [ "$1" = "osd_with_metadata_partition_device" ]; then
    yq w -i -d0 cluster-test.yaml spec.storage.devices[0].name "${BLOCK}2"
    yq w -i -d0 cluster-test.yaml spec.storage.devices[0].config.metadataDevice "${BLOCK}1"
  elif [ "$1" = "encryption" ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}\n    config:\n      encryptedDevice: \"true\"|g" cluster-test.yaml
  elif [ "$1" = "lvm" ]; then
    sed -i "s|#deviceFilter:|devices:\n      - name: \"/dev/test-rook-vg/test-rook-lv\"|g" cluster-test.yaml
  elif [ "$1" = "loop" ]; then
    # add both /dev/sdX1 and loop device to test them at the same time
    sed -i "s|#deviceFilter:|devices:\n      - name: \"${BLOCK}\"\n      - name: \"/dev/loop1\"|g" cluster-test.yaml
  else
    echo "invalid argument: $*" >&2
    exit 1
  fi

  # enable monitoring
  yq w -i -d0 cluster-test.yaml spec.monitoring.enabled true
  kubectl create -f https://raw.githubusercontent.com/coreos/prometheus-operator/v0.71.1/bundle.yaml
  kubectl create -f monitoring/rbac.yaml

  # create the cluster resources
  kubectl create -f cluster-test.yaml
  kubectl create -f object-shared-pools-test.yaml
  kubectl create -f object-a.yaml
  kubectl create -f object-b.yaml
  kubectl create -f pool-test.yaml
  kubectl create -f filesystem-test.yaml
  sed -i "/resources:/,/ # priorityClassName:/d" rbdmirror.yaml
  kubectl create -f rbdmirror.yaml
  sed -i "/resources:/,/ # priorityClassName:/d" filesystem-mirror.yaml
  kubectl create -f filesystem-mirror.yaml
  kubectl create -f nfs-test.yaml
  kubectl create -f subvolumegroup.yaml
  deploy_toolbox
}

function deploy_csi_hostnetwork_disabled_cluster() {
  create_cluster_prerequisites
  cd deploy/examples
  sed -i 's/.*CSI_ENABLE_HOST_NETWORK:.*/  CSI_ENABLE_HOST_NETWORK: "false"/g' operator.yaml
  deploy_manifest_with_local_build operator.yaml
  if [ $# == 0 ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}|g" cluster-test.yaml
  elif [ "$1" = "two_osds_in_device" ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}\n    config:\n      osdsPerDevice: \"2\"|g" cluster-test.yaml
  elif [ "$1" = "osd_with_metadata_device" ]; then
    sed -i "s|#deviceFilter:|deviceFilter: ${BLOCK/\/dev\//}\n    config:\n      metadataDevice: /dev/test-rook-vg/test-rook-lv|g" cluster-test.yaml
  fi
  kubectl create -f nfs-test.yaml
  kubectl create -f cluster-test.yaml
  kubectl create -f filesystem-test.yaml
  deploy_toolbox
}

function wait_for_prepare_pod() {
  OSD_COUNT=$1
  get_pod_cmd=(kubectl --namespace rook-ceph get pod --no-headers)
  timeout=450
  start_time="${SECONDS}"
  while [[ $((SECONDS - start_time)) -lt $timeout ]]; do
    pod="$("${get_pod_cmd[@]}" --selector=app=rook-ceph-osd-prepare --output custom-columns=NAME:.metadata.name,PHASE:status.phase | awk 'FNR <= 1')"
    if echo "$pod" | grep 'Running\|Succeeded\|Failed'; then break; fi
    echo 'waiting for at least one osd prepare pod to be running or finished'
    sleep 5
  done
  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd-prepare --output name | awk 'FNR <= 1')"
  kubectl --namespace rook-ceph logs --follow "$pod"
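  # `logs --follow` blocks until the prepare pod exits, so reaching this point
  # means provisioning is done and the OSD daemon pods should start appearing;
  # the loop below polls for them with a shorter timeout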
--follow "$pod" timeout=60 start_time="${SECONDS}" while [[ $((SECONDS - start_time)) -lt $timeout ]]; do pod_count="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd --output custom-columns=NAME:.metadata.name,PHASE:status.phase | grep --count 'Running' || true)" if [ "$pod_count" -ge "$OSD_COUNT" ]; then break; fi echo 'waiting for $OSD_COUNT OSD pod(s) to be running' sleep 1 done # getting the below logs is a best-effort attempt, so use '|| true' to allow failures pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output name)" || true kubectl --namespace rook-ceph logs "$pod" || true job="$(kubectl --namespace rook-ceph get job --selector app=rook-ceph-osd-prepare --output name | awk 'FNR <= 1')" || true kubectl -n rook-ceph describe "$job" || true kubectl -n rook-ceph describe deployment/rook-ceph-osd-0 || true } function wait_for_cleanup_pod() { timeout 180 bash <&2 && return 1 } function restart_operator() { local namespace="${1:-rook-ceph}" # optional param 1: the namespace of the CephCluster (default: rook-ceph) kubectl --namespace "$namespace" delete pod --selector app=rook-ceph-operator # wait for new pod to be running get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers) timeout 20 bash -c \ "until [[ -n \"\$(${get_pod_cmd[*]} --field-selector=status.phase=Running 2>/dev/null)\" ]] ; do echo waiting && sleep 1; done" "${get_pod_cmd[@]}" } function get_clusterip() { local ns=${1?namespace is required} local cluster_name=${2?cluster name is required} kubectl -n "$ns" get svc "$cluster_name" -o jsonpath="{.spec.clusterIP}" } function get_secret_key() { local ns=${1?namespace is required} local secret_name=${2?secret name is required} local key=${3?skey is required} kubectl -n "$ns" get secrets "$secret_name" -o jsonpath="{.data.$key}" | base64 --decode } function s3cmd() { command timeout 20 s3cmd -v --config=s3cfg --access_key="${S3CMD_ACCESS_KEY}" --secret_key="${S3CMD_SECRET_KEY}" "$@" } function write_object_read_from_replica_cluster() { local write_cluster_ip=${1?ip address of cluster to write to is required} local read_cluster_ip=${2?ip address of cluster to read from is required} local test_bucket_name=${3?name of the test bucket is required} local test_object_name="${test_bucket_name}-1mib-test.dat" fallocate -l 1M "$test_object_name" # ensure that test file has unique data echo "$test_object_name" >>"$test_object_name" s3cmd --host="${write_cluster_ip}" mb "s3://${test_bucket_name}" s3cmd --host="${write_cluster_ip}" put "$test_object_name" "s3://${test_bucket_name}" # Schedule a signal for 60s into the future as a timeout on retrying s3cmd. # This voodoo is to avoid running everything under a new shell started by # `timeout`, as there would be no way to pass functions to as it wouldn't be # a direct sub-shell. 
  S3CMD_ERROR=0
  (
    sleep 60
    kill -s SIGUSR1 $$
  ) 2>/dev/null &
  trap "{ S3CMD_ERROR=1; break; }" SIGUSR1
  until s3cmd --host="${read_cluster_ip}" get "s3://${test_bucket_name}/${test_object_name}" "${test_object_name}.get" --force; do
    echo "waiting for object to be replicated"
    sleep 5
  done
  if [[ $S3CMD_ERROR != 0 ]]; then
    echo "s3cmd failed"
    exit $S3CMD_ERROR
  fi

  diff "$test_object_name" "${test_object_name}.get"
}

function test_multisite_object_replication() {
  S3CMD_ACCESS_KEY=$(get_secret_key rook-ceph realm-a-keys access-key)
  readonly S3CMD_ACCESS_KEY
  S3CMD_SECRET_KEY=$(get_secret_key rook-ceph realm-a-keys secret-key)
  readonly S3CMD_SECRET_KEY

  local cluster_1_ip
  cluster_1_ip=$(get_clusterip rook-ceph rook-ceph-rgw-multisite-store)
  local cluster_2_ip
  cluster_2_ip=$(get_clusterip rook-ceph-secondary rook-ceph-rgw-zone-b-multisite-store)

  cd deploy/examples
  cat <<-EOF >s3cfg
[default]
host_bucket = no.way
use_https = False
EOF

  write_object_read_from_replica_cluster "$cluster_1_ip" "$cluster_2_ip" test1
  write_object_read_from_replica_cluster "$cluster_2_ip" "$cluster_1_ip" test2
}

function create_helm_tag() {
  helm_tag="$(cat _output/version)"
  build_image="$(docker images | awk '/build-/ {print $1}')"
  docker tag "${build_image}" "rook/ceph:${helm_tag}"
}

function deploy_multus() {
  # download the multus daemonset, and remove mem and cpu limits that cause it to crash on minikube
  curl https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset-thick.yml |
    sed -e 's/cpu: /# cpu: /g' -e 's/memory: /# memory: /g' |
    kubectl apply -f -

  # install whereabouts
  kubectl apply \
    -f https://raw.githubusercontent.com/k8snetworkplumbingwg/whereabouts/master/doc/crds/daemonset-install.yaml \
    -f https://github.com/k8snetworkplumbingwg/whereabouts/raw/master/doc/crds/whereabouts.cni.cncf.io_ippools.yaml \
    -f https://github.com/k8snetworkplumbingwg/whereabouts/raw/master/doc/crds/whereabouts.cni.cncf.io_overlappingrangeipreservations.yaml

  # create the rook-ceph namespace if it doesn't exist; the NAD will go in this namespace
  kubectl create namespace rook-ceph || true

  # install network attachment definitions
  IFACE="eth0" # the runner has eth0 so we don't need any heuristics to find the interface
  kubectl apply -f - <