Lesson 11.2: Backup and Restore Strategies

This lesson walks through backing up etcd with etcdctl and restoring cluster state from that snapshot. Start on the control plane node by inspecting the etcd static pod manifest, which reveals the data directory and the TLS certificate paths that etcdctl will need:

root@dev-control-plane:/etc/kubernetes/manifests# pwd
/etc/kubernetes/manifests
 
root@dev-control-plane:/etc/kubernetes/manifests# cat etcd.yaml 
apiVersion: v1
kind: Pod
metadata:
  annotations:
    kubeadm.kubernetes.io/etcd.advertise-client-urls: https://172.18.0.3:2379
  creationTimestamp: null
  labels:
    component: etcd
    tier: control-plane
  name: etcd
  namespace: kube-system
spec:
  containers:
  - command:
    - etcd
    - --advertise-client-urls=https://172.18.0.3:2379
    - --cert-file=/etc/kubernetes/pki/etcd/server.crt
    - --client-cert-auth=true
    - --data-dir=/var/lib/etcd
    - --experimental-initial-corrupt-check=true
    - --experimental-watch-progress-notify-interval=5s
    - --initial-advertise-peer-urls=https://172.18.0.3:2380
    - --initial-cluster=dev-control-plane=https://172.18.0.3:2380
    - --key-file=/etc/kubernetes/pki/etcd/server.key
    - --listen-client-urls=https://127.0.0.1:2379,https://172.18.0.3:2379
    - --listen-metrics-urls=http://127.0.0.1:2381
    - --listen-peer-urls=https://172.18.0.3:2380
    - --name=dev-control-plane
    - --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
    - --peer-client-cert-auth=true
    - --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
    - --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
    - --snapshot-count=10000
    - --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
    image: registry.k8s.io/etcd:3.5.16-0
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 127.0.0.1
        path: /livez
        port: 2381
        scheme: HTTP
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: etcd
    readinessProbe:
      failureThreshold: 3
      httpGet:
        host: 127.0.0.1
        path: /readyz
        port: 2381
        scheme: HTTP
      periodSeconds: 1
      timeoutSeconds: 15
    resources:
      requests:
        cpu: 100m
        memory: 100Mi
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 127.0.0.1
        path: /readyz
        port: 2381
        scheme: HTTP
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /var/lib/etcd
      name: etcd-data
    - mountPath: /etc/kubernetes/pki/etcd
      name: etcd-certs
  hostNetwork: true
  priority: 2000001000
  priorityClassName: system-node-critical
  securityContext:
    seccompProfile:
      type: RuntimeDefault
  volumes:
  - hostPath:
      path: /etc/kubernetes/pki/etcd
      type: DirectoryOrCreate
    name: etcd-certs
  - hostPath:
      path: /var/lib/etcd
      type: DirectoryOrCreate
    name: etcd-data
status: {}
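
The flags that matter here are --data-dir and the TLS file paths; everything under /etc/kubernetes/pki/etcd gets reused on the etcdctl command line. A quick way to pull just those values out of the manifest (a convenience sketch, not part of the original session):

# Extract the data directory and TLS paths from the kubeadm-generated manifest
grep -E 'data-dir|cert-file|key-file|trusted-ca-file' /etc/kubernetes/manifests/etcd.yaml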

Install the etcd client (etcdctl):

root@dev-control-plane:~# apt-get update && apt-get install etcd-client -y 
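
A quick sanity check that the client installed (note: etcdctl version is the v3 syntax; older v2 clients use etcdctl --version):

etcdctl version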

View the snapshot subcommand's options:

root@dev-control-plane:~# ETCDCTL_API=3 etcdctl snapshot
NAME:
	snapshot - Manages etcd node snapshots
 
USAGE:
	etcdctl snapshot <subcommand> [flags]
 
API VERSION:
	3.4
 
 
COMMANDS:
	restore	Restores an etcd member snapshot to an etcd directory
	save	Stores an etcd node backend snapshot to a given file
	status	Gets backend snapshot status of a given file
 
OPTIONS:
  -h, --help[=false]	help for snapshot
 
GLOBAL OPTIONS:
      --cacert=""				verify certificates of TLS-enabled secure servers using this CA bundle
      --cert=""					identify secure client using this TLS certificate file
      --command-timeout=5s			timeout for short running command (excluding dial timeout)
      --debug[=false]				enable client-side debug logging
      --dial-timeout=2s				dial timeout for client connections
  -d, --discovery-srv=""			domain name to query for SRV records describing cluster endpoints
      --discovery-srv-name=""			service name to query when using DNS discovery
      --endpoints=[127.0.0.1:2379]		gRPC endpoints
      --hex[=false]				print byte strings as hex encoded strings
      --insecure-discovery[=true]		accept insecure SRV records describing cluster endpoints
      --insecure-skip-tls-verify[=false]	skip server certificate verification (CAUTION: this option should be enabled only for testing purposes)
      --insecure-transport[=true]		disable transport security for client connections
      --keepalive-time=2s			keepalive time for client connections
      --keepalive-timeout=6s			keepalive timeout for client connections
      --key=""					identify secure client using this TLS key file
      --password=""				password for authentication (if this option is used, --user option shouldn't include password)
      --user=""					username[:password] for authentication (prompt if password is not supplied)
  -w, --write-out="simple"			set the output format (fields, json, protobuf, simple, table)

Set the etcdctl API version as an environment variable so it applies to every subsequent command:

root@dev-control-plane:~# export ETCDCTL_API=3
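
The TLS flags can be exported the same way: etcdctl reads any flag from an ETCDCTL_-prefixed environment variable. A sketch that would shorten the commands below to just the subcommand:

export ETCDCTL_ENDPOINTS=https://127.0.0.1:2379
export ETCDCTL_CACERT=/etc/kubernetes/pki/etcd/ca.crt
export ETCDCTL_CERT=/etc/kubernetes/pki/etcd/server.crt
export ETCDCTL_KEY=/etc/kubernetes/pki/etcd/server.key

# With those set, the save below reduces to: etcdctl snapshot save /opt/etcd-backup.db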

Take a snapshot, passing the endpoint and TLS options explicitly:

root@dev-control-plane:~# etcdctl --endpoints=https://127.0.0.1:2379 \
> --cacert=/etc/kubernetes/pki/etcd/ca.crt \
> --cert=/etc/kubernetes/pki/etcd/server.crt \
> --key=/etc/kubernetes/pki/etcd/server.key \
> snapshot save /opt/etcd-backup.db
 
{"level":"info","ts":1742096447.892546,"caller":"snapshot/v3_snapshot.go:119","msg":"created temporary db file","path":"/opt/etcd-backup.db.part"}
{"level":"info","ts":"2025-03-16T03:40:47.897Z","caller":"clientv3/maintenance.go:200","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":1742096447.8971481,"caller":"snapshot/v3_snapshot.go:127","msg":"fetching snapshot","endpoint":"https://127.0.0.1:2379"}
{"level":"info","ts":"2025-03-16T03:40:47.922Z","caller":"clientv3/maintenance.go:208","msg":"completed snapshot read; closing"}
{"level":"info","ts":1742096447.9275408,"caller":"snapshot/v3_snapshot.go:142","msg":"fetched snapshot","endpoint":"https://127.0.0.1:2379","size":"5.1 MB","took":0.034611291}
{"level":"info","ts":1742096447.9276412,"caller":"snapshot/v3_snapshot.go:152","msg":"saved","path":"/opt/etcd-backup.db"}
Snapshot saved at /opt/etcd-backup.db
 
# Checking the size 
root@dev-control-plane:~# du -sh /opt/etcd-backup.db 
4.9M	/opt/etcd-backup.db
root@dev-control-plane:~# etcdctl --write-out=table snapshot status /opt/etcd-backup.db 
+----------+----------+------------+------------+
|   HASH   | REVISION | TOTAL KEYS | TOTAL SIZE |
+----------+----------+------------+------------+
| f28a8279 |   116596 |       1197 |     5.1 MB |
+----------+----------+------------+------------+
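
In practice you would schedule snapshots rather than take them by hand. A minimal cron-ready wrapper (hypothetical backup location and retention count; adjust for your environment):

#!/usr/bin/env bash
set -euo pipefail

BACKUP_DIR=/opt/etcd-backups    # assumed backup location
KEEP=7                          # assumed number of snapshots to retain

mkdir -p "$BACKUP_DIR"
SNAP="$BACKUP_DIR/etcd-$(date +%Y%m%d-%H%M%S).db"

# Take a timestamped snapshot with the same TLS options as above
ETCDCTL_API=3 etcdctl \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key \
  snapshot save "$SNAP"

# Verify the snapshot is readable, then prune all but the newest $KEEP files
ETCDCTL_API=3 etcdctl --write-out=table snapshot status "$SNAP"
ls -1t "$BACKUP_DIR"/etcd-*.db | tail -n +$((KEEP + 1)) | xargs -r rm --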

Delete some Services to simulate data loss:
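
Before deleting, it can be useful to record the current state for an after-restore comparison (a hypothetical extra step, not in the original session):

kubectl get svc -o yaml > /tmp/svc-before-delete.yaml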

root@dev-control-plane:~# kubectl delete svc hello-world 
service "hello-world" deleted
root@dev-control-plane:~# kubectl delete svc hello-world2
service "hello-world2" deleted
root@dev-control-plane:~# kubectl get svc 
NAME         TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)   AGE
kubernetes   ClusterIP   10.96.0.1    <none>        443/TCP   6d23h

Restore the snapshot into a new data directory:

root@dev-control-plane:~# etcdctl --endpoints=https://127.0.0.1:2379 \
> --cacert=/etc/kubernetes/pki/etcd/ca.crt \
> --cert=/etc/kubernetes/pki/etcd/server.crt \
> --key=/etc/kubernetes/pki/etcd/server.key \
> snapshot restore /opt/etcd-backup.db --data-dir=/var/lib/etcd-restore-from-backup
{"level":"info","ts":1742097064.4672978,"caller":"snapshot/v3_snapshot.go:296","msg":"restoring snapshot","path":"/opt/etcd-backup.db","wal-dir":"/var/lib/etcd-restore-from-backup/member/wal","data-dir":"/var/lib/etcd-restore-from-backup","snap-dir":"/var/lib/etcd-restore-from-backup/member/snap"}
{"level":"info","ts":1742097064.4904122,"caller":"mvcc/kvstore.go:388","msg":"restored last compact revision","meta-bucket-name":"meta","meta-bucket-name-key":"finishedCompactRev","restored-compact-revision":115805}
{"level":"info","ts":1742097064.4958644,"caller":"membership/cluster.go:392","msg":"added member","cluster-id":"cdf818194e3a8c32","local-member-id":"0","added-peer-id":"8e9e05c52164694d","added-peer-peer-urls":["http://localhost:2380"]}
{"level":"info","ts":1742097064.4998891,"caller":"snapshot/v3_snapshot.go:309","msg":"restored snapshot","path":"/opt/etcd-backup.db","wal-dir":"/var/lib/etcd-restore-from-backup/member/wal","data-dir":"/var/lib/etcd-restore-from-backup","snap-dir":"/var/lib/etcd-restore-from-backup/member/snap"}
 
root@dev-control-plane:/var/lib# cd etcd-restore-from-backup/
root@dev-control-plane:/var/lib/etcd-restore-from-backup# ls
member
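
As the restore log above shows, the restore writes a fresh member directory containing new snap and wal subdirectories; a quick way to confirm the layout:

find /var/lib/etcd-restore-from-backup -maxdepth 3 -type d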
 
# Change the --data-dir flag, the volumeMounts mountPath, and the hostPath volume to the new directory (a scripted alternative follows the manifest)
root@dev-control-plane:/etc/kubernetes/manifests# pwd
/etc/kubernetes/manifests
root@dev-control-plane:/etc/kubernetes/manifests# vim etcd.yaml 
root@dev-control-plane:/etc/kubernetes/manifests# cat etcd.yaml 
apiVersion: v1
kind: Pod
metadata:
  annotations:
    kubeadm.kubernetes.io/etcd.advertise-client-urls: https://172.18.0.3:2379
  creationTimestamp: null
  labels:
    component: etcd
    tier: control-plane
  name: etcd
  namespace: kube-system
spec:
  containers:
  - command:
    - etcd
    - --advertise-client-urls=https://172.18.0.3:2379
    - --cert-file=/etc/kubernetes/pki/etcd/server.crt
    - --client-cert-auth=true
    - --data-dir=/var/lib/etcd-restore-from-backup              # Changed
    - --experimental-initial-corrupt-check=true
    - --experimental-watch-progress-notify-interval=5s
    - --initial-advertise-peer-urls=https://172.18.0.3:2380
    - --initial-cluster=dev-control-plane=https://172.18.0.3:2380
    - --key-file=/etc/kubernetes/pki/etcd/server.key
    - --listen-client-urls=https://127.0.0.1:2379,https://172.18.0.3:2379
    - --listen-metrics-urls=http://127.0.0.1:2381
    - --listen-peer-urls=https://172.18.0.3:2380
    - --name=dev-control-plane
    - --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
    - --peer-client-cert-auth=true
    - --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
    - --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
    - --snapshot-count=10000
    - --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
    image: registry.k8s.io/etcd:3.5.16-0
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 127.0.0.1
        path: /livez
        port: 2381
        scheme: HTTP
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: etcd
    readinessProbe:
      failureThreshold: 3
      httpGet:
        host: 127.0.0.1
        path: /readyz
        port: 2381
        scheme: HTTP
      periodSeconds: 1
      timeoutSeconds: 15
    resources:
      requests:
        cpu: 100m
        memory: 100Mi
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 127.0.0.1
        path: /readyz
        port: 2381
        scheme: HTTP
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /var/lib/etcd-restore-from-backup      # Changed
      name: etcd-data
    - mountPath: /etc/kubernetes/pki/etcd
      name: etcd-certs
  hostNetwork: true
  priority: 2000001000
  priorityClassName: system-node-critical
  securityContext:
    seccompProfile:
      type: RuntimeDefault
  volumes:
  - hostPath:
      path: /etc/kubernetes/pki/etcd
      type: DirectoryOrCreate
    name: etcd-certs
  - hostPath:
      path: /var/lib/etcd-restore-from-backup           # Changed 
      type: DirectoryOrCreate
    name: etcd-data
status: {}
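
The same three path changes can be scripted instead of edited by hand. A minimal sed sketch, assuming the stock kubeadm manifest where /var/lib/etcd appears at end-of-line in exactly those three places:

sed -i 's|/var/lib/etcd$|/var/lib/etcd-restore-from-backup|' /etc/kubernetes/manifests/etcd.yaml
grep -n 'etcd-restore-from-backup' /etc/kubernetes/manifests/etcd.yaml    # expect three hits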
 
# Restart the static pods: the kubelet watches this directory, so moving the manifests out and back forces it to tear down and recreate the pods
root@dev-control-plane:/etc/kubernetes/manifests# ls
etcd.yaml  kube-apiserver.yaml	kube-controller-manager.yaml  kube-scheduler.yaml
root@dev-control-plane:/etc/kubernetes/manifests# mv * /tmp/ 
root@dev-control-plane:/etc/kubernetes/manifests# ls /tmp 
etcd.yaml  kube-apiserver.yaml	kube-controller-manager.yaml  kube-scheduler.yaml
root@dev-control-plane:/etc/kubernetes/manifests# mv /tmp/*.yaml . 
root@dev-control-plane:/etc/kubernetes/manifests# ls
etcd.yaml  kube-apiserver.yaml	kube-controller-manager.yaml  kube-scheduler.yaml
 
# etcd shows Running, but note the stale 17h age; restarting the kubelet below refreshes the mirror pod
root@dev-control-plane:/etc/kubernetes/manifests# kubectl get pods -n=kube-system | grep etcd 
etcd-dev-control-plane                      1/1     Running       0               17h
 
root@dev-control-plane:~# systemctl restart kubelet 
root@dev-control-plane:~# systemctl daemon-reload
root@dev-control-plane:~# kubectl get pods -n=kube-system | grep etcd  
etcd-dev-control-plane                      1/1     Running       0               94s
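
If you are scripting this, you can block until the recreated pod reports Ready instead of polling kubectl get (a sketch using the standard kubectl wait):

kubectl -n kube-system wait --for=condition=Ready pod/etcd-dev-control-plane --timeout=120s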
 
# Check that the etcd data path has changed
root@dev-control-plane:~# kubectl describe pod -n=kube-system etcd-dev-control-plane 
Name:                 etcd-dev-control-plane
Namespace:            kube-system
Priority:             2000001000
Priority Class Name:  system-node-critical
Node:                 dev-control-plane/172.18.0.3
Start Time:           Sun, 16 Mar 2025 04:06:39 +0000
Labels:               component=etcd
                      tier=control-plane
Annotations:          kubeadm.kubernetes.io/etcd.advertise-client-urls: https://172.18.0.3:2379
                      kubernetes.io/config.hash: aa21150d0f2a2c19c766d77f37b9e316
                      kubernetes.io/config.mirror: aa21150d0f2a2c19c766d77f37b9e316
                      kubernetes.io/config.seen: 2025-03-16T04:04:09.095428447Z
                      kubernetes.io/config.source: file
Status:               Running
SeccompProfile:       RuntimeDefault
IP:                   172.18.0.3
IPs:
  IP:           172.18.0.3
Controlled By:  Node/dev-control-plane
Containers:
  etcd:
    Container ID:  containerd://4aa6882b32a3bc958e1ee8a83511a84849bcb4f1a7b7625c3eba0f6ffab9bf63
    Image:         registry.k8s.io/etcd:3.5.16-0
    Image ID:      sha256:7fc9d4aa817aa6a3e549f3cd49d1f7b496407be979fc36dd5f356d59ce8c3a82
    Port:          <none>
    Host Port:     <none>
    Command:
      etcd
      --advertise-client-urls=https://172.18.0.3:2379
      --cert-file=/etc/kubernetes/pki/etcd/server.crt
      --client-cert-auth=true
      --data-dir=/var/lib/etcd-restore-from-backup
      --experimental-initial-corrupt-check=true
      --experimental-watch-progress-notify-interval=5s
      --initial-advertise-peer-urls=https://172.18.0.3:2380
      --initial-cluster=dev-control-plane=https://172.18.0.3:2380
      --key-file=/etc/kubernetes/pki/etcd/server.key
      --listen-client-urls=https://127.0.0.1:2379,https://172.18.0.3:2379
      --listen-metrics-urls=http://127.0.0.1:2381
      --listen-peer-urls=https://172.18.0.3:2380
      --name=dev-control-plane
      --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
      --peer-client-cert-auth=true
      --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
      --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
      --snapshot-count=10000
      --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
    State:          Running
      Started:      Sun, 16 Mar 2025 04:04:09 +0000
    Ready:          True
    Restart Count:  0
    Requests:
      cpu:        100m
      memory:     100Mi
    Liveness:     http-get http://127.0.0.1:2381/livez delay=10s timeout=15s period=10s #success=1 #failure=8
    Readiness:    http-get http://127.0.0.1:2381/readyz delay=0s timeout=15s period=1s #success=1 #failure=3
    Startup:      http-get http://127.0.0.1:2381/readyz delay=10s timeout=15s period=10s #success=1 #failure=24
    Environment:  <none>
    Mounts:
      /etc/kubernetes/pki/etcd from etcd-certs (rw)
      /var/lib/etcd-restore-from-backup from etcd-data (rw)
Conditions:
  Type                        Status
  PodReadyToStartContainers   True 
  Initialized                 True 
  Ready                       True 
  ContainersReady             True 
  PodScheduled                True 
Volumes:
  etcd-certs:
    Type:          HostPath (bare host directory volume)
    Path:          /etc/kubernetes/pki/etcd
    HostPathType:  DirectoryOrCreate
  etcd-data:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/etcd-restore-from-backup
    HostPathType:  DirectoryOrCreate
QoS Class:         Burstable
Node-Selectors:    <none>
Tolerations:       :NoExecute op=Exists
Events:
  Type    Reason          Age                    From     Message
  ----    ------          ----                   ----     -------
  Normal  Killing         3m38s (x2 over 4m5s)   kubelet  Stopping container etcd
  Normal  Pulled          3m26s (x3 over 4m6s)   kubelet  Container image "registry.k8s.io/etcd:3.5.16-0" already present on machine
  Normal  Created         3m26s (x3 over 4m6s)   kubelet  Created container: etcd
  Normal  Started         3m26s (x3 over 4m6s)   kubelet  Started container etcd
  Normal  SandboxChanged  3m26s (x2 over 3m45s)  kubelet  Pod sandbox changed, it will be killed and re-created.
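
The full describe output is verbose; a quicker spot check that pulls only the flag we changed (hypothetical shortcut, same information):

kubectl -n kube-system get pod etcd-dev-control-plane -o yaml | grep -- '--data-dir'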

Check that the previously deleted resources are back:

root@dev-control-plane:/etc/kubernetes/manifests# kubectl get svc 
NAME           TYPE        CLUSTER-IP    EXTERNAL-IP   PORT(S)        AGE
hello-world    ClusterIP   10.96.49.80   <none>        80/TCP         21h
hello-world2   NodePort    10.96.10.83   <none>        80:32059/TCP   20h
kubernetes     ClusterIP   10.96.0.1     <none>        443/TCP        6d23h
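
As a final cross-check against the live store, etcdctl can report the current revision and DB size directly from the endpoint (compare with the snapshot status table above):

ETCDCTL_API=3 etcdctl \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key \
  endpoint status --write-out=table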


© 2025 Sanjeeb KC. All rights reserved.