RKE 创建 one-time snapshot 报 etcd 相关报错

RKE 版本: v1.3.17

Docker 版本: 19.03.15

操作系统和内核: SLES15-SP3,内核 5.3.18-57-default

主机类型和供应商: VMware Workstation

cluster.yml 文件:
nodes:
- address: red-master
  port: "22"
  internal_address: ""
  role:
  - controlplane
  - worker
  - etcd
  hostname_override: ""
  user: root
  docker_socket: /var/run/docker.sock
  ssh_key: ""
  ssh_key_path: ~/.ssh/id_rsa
  ssh_cert: ""
  ssh_cert_path: ""
  labels: {}
  taints:
services:
  etcd:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds:
    extra_env:
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds:
    win_extra_env:
    external_urls:
    ca_cert: ""
    cert: ""
    key: ""
    path: ""
    uid: 0
    gid: 0
    snapshot: null
    retention: ""
    creation: ""
    backup_config: null
  kube-api:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds:
    extra_env:
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds:
    win_extra_env:
    service_cluster_ip_range: 10.43.0.0/16
    service_node_port_range: ""
    pod_security_policy: false
    always_pull_images: false
    secrets_encryption_config: null
    audit_log: null
    admission_configuration: null
    event_rate_limit: null
  kube-controller:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds:
    extra_env:
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds:
    win_extra_env:
    cluster_cidr: 10.42.0.0/16
    service_cluster_ip_range: 10.43.0.0/16
  scheduler:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds:
    extra_env:
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds:
    win_extra_env:
  kubelet:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds:
    extra_env:
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds:
    win_extra_env:
    cluster_domain: cluster.local
    infra_container_image: ""
    cluster_dns_server: 10.43.0.10
    fail_swap_on: false
    generate_serving_certificate: false
  kubeproxy:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds:
    extra_env:
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds:
    win_extra_env:
network:
  plugin: canal
  options: {}
  mtu: 0
  node_selector: {}
  update_strategy: null
  tolerations:
authentication:
  strategy: x509
  sans:
  webhook: null
addons: ""
addons_include:
system_images:
  etcd: rancher/mirrored-coreos-etcd:v3.5.4
  alpine: rancher/rke-tools:v0.1.88
  nginx_proxy: rancher/rke-tools:v0.1.88
  cert_downloader: rancher/rke-tools:v0.1.88
  kubernetes_services_sidecar: rancher/rke-tools:v0.1.88
  kubedns: rancher/mirrored-k8s-dns-kube-dns:1.21.1
  dnsmasq: rancher/mirrored-k8s-dns-dnsmasq-nanny:1.21.1
  kubedns_sidecar: rancher/mirrored-k8s-dns-sidecar:1.21.1
  kubedns_autoscaler: rancher/mirrored-cluster-proportional-autoscaler:1.8.5
  coredns: rancher/mirrored-coredns-coredns:1.9.3
  coredns_autoscaler: rancher/mirrored-cluster-proportional-autoscaler:1.8.5
  nodelocal: rancher/mirrored-k8s-dns-node-cache:1.21.1
  kubernetes: rancher/hyperkube:v1.24.8-rancher1
  flannel: rancher/mirrored-coreos-flannel:v0.15.1
  flannel_cni: rancher/flannel-cni:v0.3.0-rancher6
  calico_node: rancher/mirrored-calico-node:v3.22.0
  calico_cni: rancher/calico-cni:v3.22.0-rancher1
  calico_controllers: rancher/mirrored-calico-kube-controllers:v3.22.0
  calico_ctl: rancher/mirrored-calico-ctl:v3.22.0
  calico_flexvol: rancher/mirrored-calico-pod2daemon-flexvol:v3.22.0
  canal_node: rancher/mirrored-calico-node:v3.22.0
  canal_cni: rancher/calico-cni:v3.22.0-rancher1
  canal_controllers: rancher/mirrored-calico-kube-controllers:v3.22.0
  canal_flannel: rancher/mirrored-flannelcni-flannel:v0.17.0
  canal_flexvol: rancher/mirrored-calico-pod2daemon-flexvol:v3.22.0
  weave_node: weaveworks/weave-kube:2.8.1
  weave_cni: weaveworks/weave-npc:2.8.1
  pod_infra_container: rancher/mirrored-pause:3.6
  ingress: rancher/nginx-ingress-controller:nginx-1.2.1-rancher1
  ingress_backend: rancher/mirrored-nginx-ingress-controller-defaultbackend:1.5-rancher1
  ingress_webhook: rancher/mirrored-ingress-nginx-kube-webhook-certgen:v1.1.1
  metrics_server: rancher/mirrored-metrics-server:v0.6.1
  windows_pod_infra_container: rancher/mirrored-pause:3.6
  aci_cni_deploy_container: noiro/cnideploy:5.2.3.4.1d150da
  aci_host_container: noiro/aci-containers-host:5.2.3.4.1d150da
  aci_opflex_container: noiro/opflex:5.2.3.4.1d150da
  aci_mcast_container: noiro/opflex:5.2.3.4.1d150da
  aci_ovs_container: noiro/openvswitch:5.2.3.4.1d150da
  aci_controller_container: noiro/aci-containers-controller:5.2.3.4.1d150da
  aci_gbp_server_container: noiro/gbp-server:5.2.3.4.1d150da
  aci_opflex_server_container: noiro/opflex-server:5.2.3.4.1d150da
ssh_key_path: ~/.ssh/id_rsa
ssh_cert_path: ""
ssh_agent_auth: false
authorization:
  mode: rbac
  options: {}
ignore_docker_version: null
enable_cri_dockerd: null
kubernetes_version: ""
private_registries:
ingress:
  provider: ""
  options: {}
  node_selector: {}
  extra_args: {}
  dns_policy: ""
  extra_envs:
  extra_volumes:
  extra_volume_mounts:
  update_strategy: null
  http_port: 0
  https_port: 0
  network_mode: ""
  tolerations:
  default_backend: null
  default_http_backend_priority_class_name: ""
  nginx_ingress_controller_priority_class_name: ""
  default_ingress_class: null
cluster_name: ""
cloud_provider:
  name: ""
prefix_path: ""
win_prefix_path: ""
addon_job_timeout: 0
bastion_host:
  address: ""
  port: ""
  user: ""
  ssh_key: ""
  ssh_key_path: ""
  ssh_cert: ""
  ssh_cert_path: ""
  ignore_proxy_env_vars: false
monitoring:
  provider: ""
  options: {}
  node_selector: {}
  update_strategy: null
  replicas: null
  tolerations:
  metrics_server_priority_class_name: ""
restore:
  restore: false
  snapshot_name: ""
rotate_encryption_key: false
dns: null

重现步骤:

red-master:~/RKE-config # rke etcd snapshot-save --config cluster.yml
INFO[0000] Running RKE version: v1.3.17
WARN[0000] Name of the snapshot is not specified, using [rke_etcd_snapshot_2022-12-26T10:41:17+08:00]
INFO[0000] Starting saving snapshot on etcd hosts
INFO[0000] [dialer] Setup tunnel for host [red-master]
INFO[0000] [state] Deploying state file to [/etc/kubernetes/rke_etcd_snapshot_2022-12-26T10:41:17+08:00.rkestate] on host [red-master]
INFO[0000] Image [rancher/rke-tools:v0.1.88] exists on host [red-master]
INFO[0000] Starting container [cluster-state-deployer] on host [red-master], try #1
INFO[0001] [state] Successfully started [cluster-state-deployer] container on host [red-master]
INFO[0001] Waiting for [cluster-state-deployer] container to exit on host [red-master]
INFO[0001] Container [cluster-state-deployer] is still running on host [red-master]: stderr: , stdout: [Waiting for file [/etc/kubernetes/cluster.rkestate] to be successfully copied to this container, retry count 1
]
INFO[0002] Container [cluster-state-deployer] is still running on host [red-master]: stderr: , stdout: [Waiting for file [/etc/kubernetes/cluster.rkestate] to be successfully copied to this container, retry count 1
]
INFO[0003] Container [cluster-state-deployer] is still running on host [red-master]: stderr: , stdout: [Waiting for file [/etc/kubernetes/cluster.rkestate] to be successfully copied to this container, retry count 1
]
INFO[0004] Container [cluster-state-deployer] is still running on host [red-master]: stderr: , stdout: [Waiting for file [/etc/kubernetes/cluster.rkestate] to be successfully copied to this container, retry count 1
]
INFO[0005] Container [cluster-state-deployer] is still running on host [red-master]: stderr: , stdout: [Waiting for file [/etc/kubernetes/cluster.rkestate] to be successfully copied to this container, retry count 1
]
INFO[0006] Removing container [cluster-state-deployer] on host [red-master], try #1
INFO[0006] [remove/cluster-state-deployer] Successfully removed container on host [red-master]
INFO[0006] [etcd] Running snapshot save once on host [red-master]
INFO[0006] Finding container [etcd] on host [red-master], try #1
INFO[0006] Image [rancher/rke-tools:v0.1.88] exists on host [red-master]
INFO[0006] Starting container [etcd-snapshot-once] on host [red-master], try #1
INFO[0007] [etcd] Successfully started [etcd-snapshot-once] container on host [red-master]
INFO[0007] Waiting for [etcd-snapshot-once] container to exit on host [red-master]
INFO[0007] Container [etcd-snapshot-once] is still running on host [red-master]: stderr: [time=“2022-12-26T02:41:24Z” level=info msg=“Initializing Onetime Backup” name=“rke_etcd_snapshot_2022-12-26T10:41:17+08:00”
], stdout:
INFO[0012] Container [etcd-snapshot-once] is still running on host [red-master]: stderr: [time=“2022-12-26T02:41:29Z” level=fatal msg=“exit status 1: {“level”:“warn”,“ts”:“2022-12-26T02:41:29.932Z”,“caller”:“clientv3/retry_interceptor.go:62”,“msg”:“retrying of unary invoker failed”,“target”:“endpoint://client-8048ea08-bfbc-4226-b797-05268850ede5/red-master:2379”,“attempt”:0,“error”:“rpc error: code = DeadlineExceeded desc = context deadline exceeded”}\nred-master:2379 is unhealthy: failed to commit proposal: context deadline exceeded\nError: unhealthy cluster\n”
], stdout:
INFO[0013] Removing container [etcd-snapshot-once] on host [red-master], try #1
WARN[0013] [etcd] Failed to take snapshot on all etcd hosts: failed to take one-time snapshot on host [red-master], exit code [1]: time=“2022-12-26T02:41:29Z” level=fatal msg=“exit status 1: {“level”:“warn”,“ts”:“2022-12-26T02:41:29.932Z”,“caller”:“clientv3/retry_interceptor.go:62”,“msg”:“retrying of unary invoker failed”,“target”:“endpoint://client-8048ea08-bfbc-4226-b797-05268850ede5/red-master:2379”,“attempt”:0,“error”:“rpc error: code = DeadlineExceeded desc = context deadline exceeded”}\nred-master:2379 is unhealthy: failed to commit proposal: context deadline exceeded\nError: unhealthy cluster\n”
FATA[0013] [etcd] Failed to take snapshot on all etcd hosts: failed to take one-time snapshot on host [red-master], exit code [1]: time=“2022-12-26T02:41:29Z” level=fatal msg=“exit status 1: {“level”:“warn”,“ts”:“2022-12-26T02:41:29.932Z”,“caller”:“clientv3/retry_interceptor.go:62”,“msg”:“retrying of unary invoker failed”,“target”:“endpoint://client-8048ea08-bfbc-4226-b797-05268850ede5/red-master:2379”,“attempt”:0,“error”:“rpc error: code = DeadlineExceeded desc = context deadline exceeded”}\nred-master:2379 is unhealthy: failed to commit proposal: context deadline exceeded\nError: unhealthy cluster\n”


尝试在单节点 RKE 集群中执行 one-time snapshot,报以上错误,还请帮忙看看这个是配置问题?还是版本问题?

谢谢。