kubernetes pod 高级实战:labels标签、node选择器、基于污点、容忍度,亲和度的多种调度策略

近期文章:使用 ansible 一键安装kubernetes+containerd+calico集群

labels标签应用场景

什么是标签

标签其实就是一对 key/value ,被关联到对象上,比如Pod,标签的使用我们倾向于能够表示对象的特殊特点,就是一眼就看出了这个Pod是干什么的,标签可以用来划分特定的对象(比如版本,服务类型等),标签可以在创建一个对象的时候直接定义,也可以在后期随时修改,每一个对象可以拥有多个标签,但是,key值必须是唯一的。创建标签之后也可以方便我们对资源进行分组管理。如果对pod打标签,之后就可以使用标签来查看、删除指定的pod。 在k8s中,大部分资源都可以打标签。

给pod打标签

[root@pengfei-master1 pod]# cat pod_first.yaml
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: nginx
  name: nginx-test
spec:
  containers:
  - name: nginx
    image: nginx
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 80

[root@pengfei-master1 pod]# kubectl apply -f pod_first.yaml
#对已经存在的pod打标签
[root@pengfei-master1 pod] kubectl label pods nginx-test  release=v1
pod/nginx-test labeled
#查看标签是否打成功:
[root@pengfei-master1 pod] kubectl get pods nginx-test --show-labels
NAME         READY   STATUS    RESTARTS   AGE   LABELS
nginx-test   1/1     Running   0          50s   app=nginx,release=v1 #说明标签打成功了

查看标签

#查看默认名称空间下指定pod具有的所有标签
[root@pengfei-master1 pod] kubectl get pods nginx-test --show-labels
#查看默认名称空间下所有pod资源的标签
[root@pengfei-master1 pod]# kubectl get pods  --show-labels
NAME                     READY   STATUS    RESTARTS   AGE     LABELS
nginx-59c76fb6b9-kzwnd   1/1     Running   0          2d21h   app=nginx,pod-template-hash=59c76fb6b9
nginx-59c76fb6b9-z4sdq   1/1     Running   0          2d21h   app=nginx,pod-template-hash=59c76fb6b9
nginx-test               1/1     Running   0          15m     app=nginx,release=v1


#列出默认名称空间下标签key是release的pod,不显示标签
[root@pengfei-master1 pod]# kubectl get pods -l release
NAME         READY   STATUS    RESTARTS   AGE
nginx-test   1/1     Running   0          19m
#列出默认名称空间下标签key是release、值是v1的pod,不显示标签
[root@pengfei-master1 pod]# kubectl get pods -l release=v1
NAME         READY   STATUS    RESTARTS   AGE
nginx-test   1/1     Running   0          19m

#列出默认名称空间下标签key是release的所有pod,并打印对应的标签值
[root@pengfei-master1 pod]# kubectl get pods -L release

#查看所有名称空间下的所有pod的标签
[root@pengfei-master1 pod]# kubectl get pods --all-namespaces --show-labels

node选择器

我们在创建pod资源的时候,pod会根据scheduler进行调度,默认会调度到随机的一个工作节点,如果我们想要pod调度到指定节点或者调度到一些具有相同特点的node节点,可以使用pod中的nodeName或者nodeSelector字段指定要调度到的node节点

nodeName字段

#查看node节点
[root@pengfei-master1 pod]# kubectl get nodes
NAME              STATUS   ROLES           AGE    VERSION
pengfei-master1   Ready    control-plane   6d1h   v1.25.0
pengfei-node1     Ready    work            6d1h   v1.25.0
pengfei-node2     Ready    work            6d     v1.25.0

将pod调度到node2节点上

[root@pengfei-master1 pod]# cat pod_node.yaml 
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: tomcat-node
  name:
    tomcat-node
spec:
  nodeName: pengfei-node2 #将pod调度到node2节点上
  containers:
  - name: tomcat-node
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080

创建pod

[root@pengfei-master1 pod]# kubectl apply -f pod_node.yaml 

查看pod调度到哪个节点

[root@pengfei-master1 pod]# kubectl get pods tomcat-node -o wide
NAME          READY   STATUS    RESTARTS   AGE     IP              NODE            NOMINATED NODE   READINESS GATES
tomcat-node   1/1     Running   0          4m13s   10.244.225.72   pengfei-node2   <none>           <none>

nodeSelector

指定pod调度到具有哪些标签的node节点上

#给node节点打标签,打个具有disk=ceph的标签
[root@pengfei-master1 pod]# kubectl label node pengfei-node2 disk=ceph
node/pengfei-node2 labeled

定义pod的时候指定要调度到具有disk=ceph标签的node上

1、删除前面创建的pod
[root@pengfei-master1 pod]# kubectl delete -f pod_node.yaml 
pod "tomcat-node" deleted
2、修改pod_node.yaml
[root@pengfei-master1 pod]# vim pod_node.yaml 
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: tomcat-node
  name:
    tomcat-node
spec:
  nodeSelector: #指定标签选择器
    disk: ceph
#  nodeName: pengfei-node2 #注释nodeName
  containers:
  - name: tomcat-node
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080

3、创建pod

[root@pengfei-master1 pod]# kubectl apply -f pod_node.yaml
[root@pengfei-master1 pod]# kubectl get pods tomcat-node -owide
NAME          READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
tomcat-node   1/1     Running   0          27s   10.244.225.73   pengfei-node2   <none>           <none>  

可以看到调度到node2节点

4、假如没有匹配到,则会调度失败,pod一直处于Pending状态

修改为disk: ceph1
[root@pengfei-master1 pod]# kubectl get pods tomcat-node -o wide
NAME          READY   STATUS    RESTARTS   AGE   IP       NODE     NOMINATED NODE   READINESS GATES
tomcat-node   0/1     Pending   0          94s   <none>   <none>   <none>           <none>
Events:
  Type     Reason            Age   From               Message
  ----     ------            ----  ----               -------
  Warning  FailedScheduling  43s   default-scheduler  0/3 nodes are available: 1 node(s) had untolerated taint {node-role.kubernetes.io/control-plane: }, 3 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling.

可以看到没有匹配到规则,调度失败

删除node标签

删标签的方式是在标签后面加-,如下所示

[root@pengfei-master1 pod]# kubectl label node pengfei-node2 disk-
node/pengfei-node2 unlabeled

污点、容忍度、亲和性

node节点亲和性

node节点亲和性调度:nodeAffinity

查看affinity帮助

[root@pengfei-master1 pod]# kubectl explain pods.spec.affinity
KIND:     Pod
VERSION:  v1

RESOURCE: affinity <Object>

DESCRIPTION:
     If specified, the pod's scheduling constraints

     Affinity is a group of affinity scheduling rules.

FIELDS:
   nodeAffinity	<Object> #node亲和性
     Describes node affinity scheduling rules for the pod.

   podAffinity	<Object> #pod亲和性
     Describes pod affinity scheduling rules (e.g. co-locate this pod in the
     same node, zone, etc. as some other pod(s)).

   podAntiAffinity	<Object> #pod反亲和性
     Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod
     in the same node, zone, etc. as some other pod(s))
     

查看node亲和性

[root@pengfei-master1 pod]# kubectl explain pods.spec.affinity.nodeAffinity
KIND:     Pod
VERSION:  v1

RESOURCE: nodeAffinity <Object>

DESCRIPTION:
     Describes node affinity scheduling rules for the pod.

     Node affinity is a group of node affinity scheduling rules.

FIELDS:
   preferredDuringSchedulingIgnoredDuringExecution	<[]Object>#preferred表示有节点尽量满足这个位置定义的亲和性,这不是一个必须的条件,软亲和性
     The scheduler will prefer to schedule pods to nodes that satisfy the
     affinity expressions specified by this field, but it may choose a node that
     violates one or more of the expressions. The node that is most preferred is
     the one with the greatest sum of weights, i.e. for each node that meets all
     of the scheduling requirements (resource request, requiredDuringScheduling
     affinity expressions, etc.), compute a sum by iterating through the
     elements of this field and adding "weight" to the sum if the node matches
     the corresponding matchExpressions; the node(s) with the highest sum are
     the most preferred.

   requiredDuringSchedulingIgnoredDuringExecution	<Object> #require表示必须有节点满足这个位置定义的亲和性,这是个硬性条件,硬亲和性
     If the affinity requirements specified by this field are not met at
     scheduling time, the pod will not be scheduled onto the node. If the
     affinity requirements specified by this field cease to be met at some point
     during pod execution (e.g. due to an update), the system may or may not try
     to eventually evict the pod from its node.

查看matchExpressions帮助

[root@pengfei-master1 ~]# kubectl explain pod.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms.matchExpressions
KIND:     Pod
VERSION:  v1

RESOURCE: matchExpressions <[]Object>

DESCRIPTION:
     A list of node selector requirements by node's labels.

     A node selector requirement is a selector that contains values, a key, and
     an operator that relates the key and values.

FIELDS:
   key	<string> -required- #label名称
     The label key that the selector applies to.

   operator	<string> -required- #做等值选择还是不等值选择
     Represents a key's relationship to a set of values. Valid operators are In,
     NotIn, Exists, DoesNotExist. Gt, and Lt.

     Possible enum values:
     - `"DoesNotExist"`
     - `"Exists"`
     - `"Gt"`
     - `"In"`
     - `"Lt"`
     - `"NotIn"`

   values	<[]string> #label key值
     An array of string values. If the operator is In or NotIn, the values array
     must be non-empty. If the operator is Exists or DoesNotExist, the values
     array must be empty. If the operator is Gt or Lt, the values array must
     have a single element, which will be interpreted as an integer. This array
     is replaced during a strategic merge patch

测试node节点硬亲和性

使用requiredDuringSchedulingIgnoredDuringExecution

[root@pengfei-master1 nodes]# cat pod_affinity_demo.yaml
apiVersion: v1
kind: Pod
metadata:
  name: node-affinity-demo
  labels:
    app: myapp
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: zone
            operator: In
            values:
            - foo
            - bar
  containers:
  - name: myapp
    image: nginx
    imagePullPolicy: IfNotPresent

当前节点中有任意一个节点拥有zone标签的值是foo或者bar,就可以把pod调度到这个node节点上

[root@pengfei-master1 nodes]# kubectl get nodes --show-labels
NAME              STATUS   ROLES           AGE   VERSION   LABELS
pengfei-master1   Ready    control-plane   9d    v1.25.0   beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,kubernetes.io/arch=amd64,kubernetes.io/hostname=pengfei-master1,kubernetes.io/os=linux,node-role.kubernetes.io/control-plane=,node.kubernetes.io/exclude-from-external-load-balancers=
pengfei-node1     Ready    work            9d    v1.25.0   beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,disk=ceph,kubernetes.io/arch=amd64,kubernetes.io/hostname=pengfei-node1,kubernetes.io/os=linux,node-role.kubernetes.io/work=work
pengfei-node2     Ready    work            9d    v1.25.0   beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,kubernetes.io/arch=amd64,kubernetes.io/hostname=pengfei-node2,kubernetes.io/os=linux,node-role.kubernetes.io/work=work

集群中没有zone标签,pod调度失败

[root@pengfei-master1 nodes]# kubectl get pods -owide
NAME                 READY   STATUS    RESTARTS   AGE     IP       NODE     NOMINATED NODE   READINESS GATES
node-affinity-demo   0/1     Pending   0          5m30s   <none>   <none>   <none>           <none>
#status的状态是pending,说明没有完成调度,因为没有一个拥有zone的标签的值是foo或者bar,而且使用的是硬亲和性,必须满足条件才能完成调度

[root@pengfei-master1 nodes]# kubectl describe pods node-affinity-demo
Events:
  Type     Reason            Age                   From               Message
  ----     ------            ----                  ----               -------
  Warning  FailedScheduling  105s (x2 over 6m51s)  default-scheduler  0/3 nodes are available: 1 node(s) had untolerated taint {node-role.kubernetes.io/control-plane: }, 3 node(s) didn't match Pod's node affinity/selector. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling

给node节点打上标签,重新调度

[root@pengfei-master1 nodes]# kubectl label node pengfei-node1 zone=foo
node/pengfei-node1 labeled       
[root@pengfei-master1 nodes]# kubectl get pods -o wide
NAME                 READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
node-affinity-demo   1/1     Running   0          29s   10.244.128.76   pengfei-node1   <none>           <none>

测试权重weight

weight是相对权重,权重越高,pod调度的几率越大

      preferredDuringSchedulingIgnoredDuringExecution:
      - preference:
          matchExpressions:
          - key: zone1
            operator: In
            values:
            - foo1
            - bar1
        weight: 40 #权重
      - preference:
          matchExpressions:
          - key: zone2
            operator: In
            values:
            - foo2
            - bar2
        weight: 20
        
#zone1标签权重高于zone2

给pengfei-node1和pengfei-node2都打上标签

[root@pengfei-master1 nodes]# kubectl label node pengfei-node1 zone1=foo1
node/pengfei-node1 labeled
[root@pengfei-master1 nodes]# kubectl label node pengfei-node2 zone2=foo2
node/pengfei-node2 labeled

删除前面创建的pod,重新创建

[root@pengfei-master1 nodes]# kubectl delete -f pod_affinity_demo2.yaml #删除前面创建的pod,
[root@pengfei-master1 nodes]# kubectl apply -f pod_affinity_demo2.yaml 
pod/node-affinity-demo2 created

再次查看pod,被调度上了node1节点

[root@pengfei-master1 nodes]# kubectl get pods -o wide
NAME                  READY   STATUS    RESTARTS   AGE    IP              NODE            NOMINATED NODE   READINESS GATES
node-affinity-demo    1/1     Running   0          146m   10.244.128.76   pengfei-node1   <none>           <none>
node-affinity-demo2   1/1     Running   0          12s    10.244.128.77   pengfei-node1   <none>           <none>

pod在定义node节点亲和性的时候,pengfei-node1和pengfei-node2都满足条件,都可以调度pod,但是pengfei-node1具有的标签是zone1=foo1,pod匹配zone1=foo1的权重更高,那么pod就会优先调度到pengfei-node1上

删除标签和pod

[root@pengfei-master1 nodes]# kubectl label node pengfei-node1 zone1-
node/pengfei-node1 unlabeled
[root@pengfei-master1 nodes]# kubectl label node pengfei-node2 zone2-
node/pengfei-node2 unlabeled
[root@pengfei-master1 nodes]# kubectl delete -f pod_affinity_demo.yaml 
pod "node-affinity-demo" deleted
[root@pengfei-master1 nodes]# kubectl delete -f pod_affinity_demo2.yaml 
pod "node-affinity-demo2" deleted

pod亲和性

pod自身的亲和性调度有两种表示形式
1、podAffinity:pod和pod更倾向腻在一起,把相近的pod结合到相近的位置,如同一区域,同一机架,这样的话pod和pod之间更好通信,比方说有两个机房,这两个机房部署的集群有1000台主机,那么我们希望把nginx和tomcat都部署同一个地方的node节点上,可以提高通信效率;

2、podAntiAffinity:pod和pod更倾向不腻在一起,如果部署两套程序,那么这两套程序更倾向于反亲和性,这样相互之间不会有影响。

第一个pod随机选择一个节点,做为评判后续的pod能否到达这个pod所在的节点上的运行方式,这就称为pod亲和性;我们怎么判定哪些节点是相同位置的,哪些节点是不同位置的;我们在定义pod亲和性时需要有一个前提,哪些pod在同一个位置,哪些pod不在同一个位置,这个位置是怎么定义的,标准是什么?以节点名称为标准,这个节点名称相同的表示是同一个位置,节点名称不相同的表示不是一个位置。

查看pod亲和性帮助

[root@pengfei-master1 nodes]# kubectl explain pods.spec.affinity.podAffinity
KIND:     Pod
VERSION:  v1

RESOURCE: podAffinity <Object>

DESCRIPTION:
     Describes pod affinity scheduling rules (e.g. co-locate this pod in the
     same node, zone, etc. as some other pod(s)).

     Pod affinity is a group of inter pod affinity scheduling rules.

FIELDS:
   preferredDuringSchedulingIgnoredDuringExecution	<[]Object> #软亲和性
     The scheduler will prefer to schedule pods to nodes that satisfy the
     affinity expressions specified by this field, but it may choose a node that
     violates one or more of the expressions. The node that is most preferred is
     the one with the greatest sum of weights, i.e. for each node that meets all
     of the scheduling requirements (resource request, requiredDuringScheduling
     affinity expressions, etc.), compute a sum by iterating through the
     elements of this field and adding "weight" to the sum if the node has pods
     which matches the corresponding podAffinityTerm; the node(s) with the
     highest sum are the most preferred.

   requiredDuringSchedulingIgnoredDuringExecution	<[]Object> #硬亲和性
     If the affinity requirements specified by this field are not met at
     scheduling time, the pod will not be scheduled onto the node. If the
     affinity requirements specified by this field cease to be met at some point
     during pod execution (e.g. due to a pod label update), the system may or
     may not try to eventually evict the pod from its node. When there are
     multiple elements, the lists of nodes corresponding to each podAffinityTerm
     are intersected, i.e. all terms must be satisfied
[root@pengfei-master1 nodes]# kubectl explain pods.spec.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution
KIND:     Pod
VERSION:  v1

......

FIELDS:
   labelSelector	<Object> #我们要判断pod跟别的pod亲和,跟哪个pod亲和,需要靠labelSelector,通过labelSelector选择一组能作为亲和对象的pod资源
     A label query over a set of resources, in this case pods.

   namespaceSelector	<Object>
     A label query over the set of namespaces that the term applies to. The term
     is applied to the union of the namespaces selected by this field and the
     ones listed in the namespaces field. null selector and null or empty
     namespaces list means "this pod's namespace". An empty selector ({})
     matches all namespaces.

   namespaces	<[]string> #labelSelector需要选择一组资源,那么这组资源是在哪个名称空间中呢,通过namespace指定,如果不指定namespaces,那么就是当前创建pod的名称空间
     namespaces specifies a static list of namespace names that the term applies
     to. The term is applied to the union of the namespaces listed in this field
     and the ones selected by namespaceSelector. null or empty namespaces list
     and null namespaceSelector means "this pod's namespace".

   topologyKey	<string> -required- #位置拓扑的键,这个是必须字段
     This pod should be co-located (affinity) or not co-located (anti-affinity)
     with the pods matching the labelSelector in the specified namespaces, where
     co-located is defined as running on a node whose value of the label with
     key topologyKey matches that of any node on which any of the selected pods
     is running. Empty topologyKey is not allowed.
     
     #怎么判断是不是同一个位置:
			#rack=rack1
			#row=row1
      #使用rack的键是同一个位置
      #使用row的键是同一个位置
[root@pengfei-master1 nodes]# kubectl explain pods.spec.affinity.podAffinity.requiredDuringSchedulingIgnoredDuringExecution.labelSelector
KIND:     Pod
VERSION:  v1

RESOURCE: labelSelector <Object>

DESCRIPTION:
     A label query over a set of resources, in this case pods.

     A label selector is a label query over a set of resources. The result of
     matchLabels and matchExpressions are ANDed. An empty label selector matches
     all objects. A null label selector matches no objects.

FIELDS:
   matchExpressions	<[]Object>
     matchExpressions is a list of label selector requirements. The requirements
     are ANDed.

   matchLabels	<map[string]string>
     matchLabels is a map of {key,value} pairs. A single {key,value} in the
     matchLabels map is equivalent to an element of matchExpressions, whose key
     field is "key", the operator is "In", and the values array contains only
     "value". The requirements are ANDed
[root@pengfei-master1 pod]# cat pod_affinity_first.yaml
apiVersion: v1
kind: Pod
metadata:
  name: myapp-1
  labels: 
    app: myapp-1

spec:
  containers:
  - name: myapp-1
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080
[root@pengfei-master1 pod]# kubectl apply -f pod_affinity_first.yaml
pod/myapp-1 created
[root@pengfei-master1 pod]# kubectl get pods -owide
NAME      READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
myapp-1   1/1     Running   0          13s   10.244.225.81   pengfei-node2   <none>           <none>
[root@pengfei-master1 pod]# cat pod_affinity_second.yaml
apiVersion: v1
kind: Pod
metadata:
  name: myapp-2
  labels: 
    app: myapp-2
spec:
  affinity:
    podAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: app
            operator: In
            values:
            - myapp-1 #指定myapp-2 pod创建到拥有myapp-1标签的pod上
        topologyKey: kubernetes.io/hostname 
  containers:
  - name: tomcat
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080
[root@pengfei-master1 pod]# kubectl apply -f pod_affinity_second.yaml 
pod/myapp-2 created
[root@pengfei-master1 pod]# kubectl get pods -o wide
NAME      READY   STATUS    RESTARTS   AGE     IP              NODE            NOMINATED NODE   READINESS GATES
myapp-1   1/1     Running   0          7m14s   10.244.225.81   pengfei-node2   <none>           <none>
myapp-2   1/1     Running   0          7s      10.244.225.82   pengfei-node2   <none>           <none>
#上面说明第一个pod调度到哪,第二个pod也调度到哪,这就是pod节点亲和性

删除pod

[root@pengfei-master1 pod]# kubectl delete -f pod_affinity_first.yaml 
pod "myapp-1" deleted
[root@pengfei-master1 pod]# kubectl delete -f pod_affinity_second.yaml 
pod "myapp-2" deleted

pod反亲和性

定义两个pod,第一个pod做为基准,第二个pod跟它调度节点相反

创建第一个pod

[root@pengfei-master1 pod]# cat pod_affinity_first.yaml
apiVersion: v1
kind: Pod
metadata:
  name: myapp-1
  labels: 
    app: myapp-1

spec:
  containers:
  - name: myapp-1
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080
[root@pengfei-master1 pod]# kubectl apply -f pod_affinity_first.yaml 
pod/myapp-1 created

创建第二个pod

[root@pengfei-master1 pod]# cat pod_ani_affinity_three.yaml 
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: myapp-3
  name: myapp-3
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: app
            operator: In
            values:
            - myapp-1 #指定和myapp-1 pod反亲和性
        topologyKey: kubernetes.io/hostname
  containers:
  - name: myapp-3
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080
    
[root@pengfei-master1 pod]# kubectl apply -f pod_ani_affinity_three.yaml 
pod/myapp-3 created

查看pod

[root@pengfei-master1 pod]# kubectl get pods -owide
NAME      READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
myapp-1   1/1     Running   0          98s   10.244.225.83   pengfei-node2   <none>           <none>
myapp-3   1/1     Running   0          8s    10.244.128.78   pengfei-node1   <none>           <none>
#上面显示两个pod不在一个node节点上,这就是pod节点反亲和性

污点、容忍度

污点给了节点选择的主动权,我们给节点打一个污点,不能容忍该污点的pod就调度不上来,污点就是定义在节点上的键值属性数据,可以决定拒绝哪些pod;
taints是键值数据,用在节点上,定义污点;
tolerations是键值数据,用在pod上,定义容忍度,能容忍哪些污点

pod亲和性是pod属性;但是污点是节点的属性,污点定义在k8s集群的节点上的一个字段

#查看master节点的污点

[root@pengfei-master1 ~]# kubectl describe nodes pengfei-master1
#查看master这个节点是否有污点,显示如下:
Taints:             node-role.kubernetes.io/control-plane:NoSchedule
#上面可以看到master这个节点的污点是NoSchedule,所以我们创建的pod都不会调度到master上,因为我们创建的pod没有容忍度
[root@pengfei-master1 ~]# kubectl describe pods kube-apiserver-pengfei-master1 -n kube-system
#显示如下
Tolerations:       :NoExecute op=Exists
#可以看到这个pod的容忍度是NoExecute,则可以调度到pengfei-master1上
[root@pengfei-master1 ~]# kubectl explain node.spec.taints
KIND:     Node
VERSION:  v1

RESOURCE: taints <[]Object>

DESCRIPTION:
     If specified, the node's taints.

     The node this Taint is attached to has the "effect" on any pod that does
     not tolerate the Taint.

FIELDS:
   effect	<string> -required- #必须字段,用来定义pod对象的排斥等级
     Required. The effect of the taint on pods that do not tolerate the taint.
     Valid effects are NoSchedule, PreferNoSchedule and NoExecute.

     Possible enum values:
     - `"NoExecute"` Evict any already-running pods that do not tolerate the
     taint. Currently enforced by NodeController.
     #既影响调度过程,又影响现存的pod对象,如果现存的pod不能容忍节点后来加的污点,这个pod就会被驱逐
     - `"NoSchedule"` Do not allow new pods to schedule onto the node unless
     they tolerate the taint, but allow all pods submitted to Kubelet without
     going through the scheduler to start, and allow all already-running pods to
     continue running. Enforced by the scheduler.
     #仅影响pod调度过程,当pod能容忍这个节点污点,就可以调度到当前节点,后来这个节点的污点改了,加了一个新的污点,使得之前调度的pod不能容忍了,也不会被驱逐
     #只有拥有和这个污点相匹配的容忍度的 Pod 才能够被分配到
     
     - `"PreferNoSchedule"` Like TaintEffectNoSchedule, but the scheduler tries
     not to schedule new pods onto the node, rather than prohibiting new pods
     from scheduling onto the node entirely. Enforced by the scheduler.
			#尽量避免将Pod调度到具有该污点的Node上
   key	<string> -required-   #必须字段
     Required. The taint key to be applied to a node.

   timeAdded	<string>
     TimeAdded represents the time at which the taint was added. It is only
     written for NoExecute taints.

   value	<string>
     The taint value corresponding to the taint key.

管理节点污点

[root@pengfei-master1 ~]# kubectl taint -h

给pengfei-node2打上污点(NoSchedule)

[root@pengfei-master1 ~]# kubectl taint node pengfei-node2 node-type=production:NoSchedule

创建pod未指明容忍度

[root@pengfei-master1 pod]# cat> pod_taint.yaml<<EOF
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: pod-taint
  name: nginx-taint
spec:
  containers:
  - name: nginx-taint
    image: nginx
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 80
EOF

创建pod

[root@pengfei-master1 pod]# kubectl apply -f pod_taint.yaml
[root@pengfei-master1 pod]# kubectl get pods -owide
NAME          READY   STATUS    RESTARTS      AGE   IP              NODE            NOMINATED NODE   READINESS GATES
myapp-3       1/1     Running   1 (24h ago)   40h   10.244.128.80   pengfei-node1   <none>           <none>
nginx-taint   1/1     Running   0             11s   10.244.128.82   pengfei-node1   <none>           <none>

#上面结果可以看出pod被调度到了pengfei-node1上,因为pengfei-node2这个节点打了污点,而我们在创建pod的时候没有容忍度,所以pengfei-node2上不会有pod调度上去的

给pengfei-node1也打上污点(NoExecute)

[root@pengfei-master1 pod]# kubectl taint node pengfei-node1 node-type=dev:NoExecute
node/pengfei-node1 tainted
[root@pengfei-master1 pod]# kubectl get pods
No resources found in default namespace.
#可以看到pengfei-node1上已经存在的pod被驱逐走了

tolerations是键值数据,用在pod上,定义容忍度,能容忍哪些污点

[root@pengfei-master1 pod]# kubectl explain pod.spec.tolerations
KIND:     Pod
VERSION:  v1

RESOURCE: tolerations <[]Object>

DESCRIPTION:
     If specified, the pod's tolerations.

     The pod this Toleration is attached to tolerates any taint that matches the
     triple <key,value,effect> using the matching operator <operator>.

FIELDS:
   effect	<string>
     Effect indicates the taint effect to match. Empty means match all taint
     effects. When specified, allowed values are NoSchedule, PreferNoSchedule
     and NoExecute.

     Possible enum values:
     - `"NoExecute"` Evict any already-running pods that do not tolerate the
     taint. Currently enforced by NodeController.
     - `"NoSchedule"` Do not allow new pods to schedule onto the node unless
     they tolerate the taint, but allow all pods submitted to Kubelet without
     going through the scheduler to start, and allow all already-running pods to
     continue running. Enforced by the scheduler.
     - `"PreferNoSchedule"` Like TaintEffectNoSchedule, but the scheduler tries
     not to schedule new pods onto the node, rather than prohibiting new pods
     from scheduling onto the node entirely. Enforced by the scheduler.

   key	<string>
     Key is the taint key that the toleration applies to. Empty means match all
     taint keys. If the key is empty, operator must be Exists; this combination
     means to match all values and all keys.

   operator	<string>
     Operator represents a key's relationship to the value. Valid operators are
     Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for
     value, so that a pod can tolerate all taints of a particular category.

     Possible enum values:
     - `"Equal"`
     - `"Exists"`

   tolerationSeconds	<integer>
     TolerationSeconds represents the period of time the toleration (which must
     be of effect NoExecute, otherwise this field is ignored) tolerates the
     taint. By default, it is not set, which means tolerate the taint forever
     (do not evict). Zero and negative values will be treated as 0 (evict
     immediately) by the system.

   value	<string>
     Value is the taint value the toleration matches to. If the operator is
     Exists, the value should be empty, otherwise just a regular string.

编写yaml

[root@pengfei-master1 pod]# cat >pod_taint_demo1.yaml<<EOF
apiVersion: v1
kind: Pod
metadata:
  name: pod-taint1
  labels:
    app: my-taint1
spec:
  tolerations:
  - key: "node-type"
    operator: "Equal"
    effect: "NoExecute"
    value: "production"
    tolerationSeconds: 3600
  containers:
  - name: pod-taint1
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080 
EOF
[root@pengfei-master1 pod]# kubectl apply -f pod_taint_demo1.yaml
[root@pengfei-master1 pod]# kubectl get pods -o wide
NAME         READY   STATUS    RESTARTS   AGE   IP       NODE     NOMINATED NODE   READINESS GATES
pod-taint1   0/1     Pending   0          46s   <none>   <none>   <none>           <none>
#一直是显示pending,因为我们使用的是equal(等值匹配),所以key和value,effect必须和node节点定义的污点完全匹配才可以

把上面配置effect: “NoExecute”变成effect: “NoSchedule” ;tolerationSeconds: 3600去掉

[root@pengfei-master1 pod]# kubectl delete -f pod_taint_demo1.yaml
[root@pengfei-master1 pod]# cat >pod_taint_demo1.yaml<<EOF
apiVersion: v1
kind: Pod
metadata:
  name: pod-taint1
  labels:
    app: my-taint1
spec:
  tolerations:
  - key: "node-type"
    operator: "Equal"
    effect: "NoSchedule"
    value: "production"
  containers:
  - name: pod-taint1
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080 
EOF

更新pod

[root@pengfei-master1 pod]# kubectl apply -f pod_taint_demo1.yaml
[root@pengfei-master1 pod]# kubectl get pods -o wide
NAME         READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
pod-taint1   1/1     Running   0          5s    10.244.225.86   pengfei-node2   <none>           <none>
#可以调度到pengfei-node2上了,因为在pod中定义的容忍度能容忍node节点上的污点

再次修改operator: “Exists”和value: “”

[root@pengfei-master1 pod]# kubectl delete -f pod_taint_demo1.yaml
[root@pengfei-master1 pod]# cat >pod_taint_demo1.yaml<<EOF
apiVersion: v1
kind: Pod
metadata:
  name: pod-taint1
  labels:
    app: my-taint1
spec:
  tolerations:
  - key: "node-type"
    operator: "Exists"
    effect: "NoSchedule"
    value: ""
  containers:
  - name: pod-taint1
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080 
EOF
[root@pengfei-master1 pod]# kubectl apply -f pod_taint_demo1.yaml 
pod/pod-taint1 created
[root@pengfei-master1 pod]# kubectl get pods
NAME         READY   STATUS    RESTARTS   AGE
pod-taint1   1/1     Running   0          3s
[root@pengfei-master1 pod]# kubectl get pods -o wide
NAME         READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
pod-taint1   1/1     Running   0          7s    10.244.128.83   pengfei-node1   <none>           <none>
#只要对应的键是存在的,exists,其值被自动定义成通配符

再次修改effect: “”

[root@pengfei-master1 pod]# kubectl delete -f pod_taint_demo1.yaml
[root@pengfei-master1 pod]# cat >pod_taint_demo1.yaml<<EOF
apiVersion: v1
kind: Pod
metadata:
  name: pod-taint1
  labels:
    app: my-taint1
spec:
  tolerations:
  - key: "node-type"
    operator: "Exists"
    effect: ""
    value: ""
  containers:
  - name: pod-taint1
    image: tomcat
    imagePullPolicy: IfNotPresent
    ports:
    - containerPort: 8080 
EOF
[root@pengfei-master1 pod]# kubectl apply -f pod_taint_demo1.yaml
[root@pengfei-master1 pod]# kubectl get pods -o wide
NAME         READY   STATUS    RESTARTS   AGE   IP              NODE            NOMINATED NODE   READINESS GATES
pod-taint1   1/1     Running   0          3s    10.244.128.84   pengfei-node1   <none>           <none>
#有一个node-type的键,不管值是什么,不管是什么效果,都能容忍
#2个节点上都有可能有pod被调度

删除污点

[root@pengfei-master1 pod]# kubectl taint node pengfei-node1 node-type=dev:NoExecute-
[root@pengfei-master1 pod]# kubectl taint node pengfei-node2 node-type=production:NoSchedule-

官方文档

Comments

No comments yet. Why don’t you start the discussion?

发表评论