Kubesphere中新增Kubernetes的GPU node节点
kubernetesGPU

1、系统环境

Centos7.6  GPU V100 2块

2、新增node节点

2.1、开启GPU

[root@master conf]# vi common.yaml
# Container Engine Acceleration
# Use nvidia gpu acceleration in containers
# KubeSphere currently support Nvidia GPU V100 P100 1060 1080 1080Ti
# The driver version is 387.26, CUDA is 9.1
nvidia_accelerator_enabled: true
nvidia_gpu_nodes:
  - gpu-node01

2.2、在hosts.ini中添加node节点信息

## Add other registry.
docker_registry_mirrors:
[root@master1 conf]# ls
common.yaml  common.yaml_bak  hosts.ini  hosts.ini_bak  plugin-aliyun.yaml  plugin-qingcloud.yaml  plugin-tencentcloud.yaml
[root@master1 conf]# cat hosts.ini
; Parameters:
;  ansible_connection: connection type to the target machine
;  ansible_host: the host name of the target machine
;  ip: ip address of the target machine
;  ansible_user: the default user name for ssh connection
;  ansible_ssh_pass: the password for ssh connection
;  ansible_become_pass: the privilege escalation password to grant access
;  ansible_port: the ssh port number, if not 22

; If installer is ran as non-root user who has sudo privilege, refer to the following sample configuration:
; e.g
;  master ansible_connection=local  ip=192.168.0.5  ansible_user=ubuntu  ansible_become_pass=Qcloud@123
;  node1  ansible_host=192.168.0.6  ip=192.168.0.6  ansible_user=ubuntu  ansible_become_pass=Qcloud@123
;  node2  ansible_host=192.168.0.8  ip=192.168.0.8  ansible_user=ubuntu  ansible_become_pass=Qcloud@123

; As recommended as below sample configuration, use root account by default to install

[all]
master1 ansible_connection=local  ip=192.168.0.6
node1  ansible_host=192.168.0.6  ip=192.168.0.6  ansible_user=ubuntu  ansible_become_pass=Qcloud@123
node2  ansible_host=192.168.0.8  ip=192.168.0.8  ansible_user=ubuntu  ansible_become_pass=Qcloud@123
gpu-node ansible_host=10.6.209.62  ip=10.6.209.62  ansible_ssh_pass=Uc!1X9MvzA

[local-registry]
master1

[kube-master]
master1
[kube-node]
#node1
#node2
gpu-node

[etcd]
master1

[k8s-cluster:children]
kube-node
kube-master

2.3、开始执行添加node节点脚本

[root@master scripts]# pwd
/root/kubesphere-all-offline-v2.1.1/scripts
[root@master scripts]# ./add-nodes.sh

等待添加完成后,查看GPU节点为NotReady状态,节点上的服务都不正常,查看日志发现缺少/usr/bin/nvidia-container-runtime

3、安装GPU驱动和nvidia docker 插件

3.1、安装nvidia docker 插件(由于没网络,选择离线安装,安装包已经下载)

[root@gpu-node01 tmp]# ls *.rpm
libnvidia-container1-1.2.0-1.x86_64.rpm       nvidia-container-runtime-3.3.0-1.x86_64.rpm  nvidia-docker2-2.4.0-1.noarch.rpm
libnvidia-container-tools-1.2.0-1.x86_64.rpm  nvidia-container-toolkit-1.2.1-2.x86_64.rpm

[root@gpu-node01 tmp]# yum install -y libnvidia-container*
[root@gpu-node01 tmp]# yum install -y nvidia-container*
[root@gpu-node01 tmp]# yum install -y nvidia-docker*

3.2、下载GPU驱动

[root@gpu-node01 tmp]# wget https://cn.download.nvidia.cn/tesla/418.116.00/NVIDIA-Linux-x86_64-418.116.00.run

3.3、 禁用nouveau

[root@gpu-node01 ~]# vi /etc/modprobe.d/nouveau-blacklist.conf
blacklist nouveau
options nouveau modeset=0

备份原始image
[root@gpu-node01 ~]# mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r)-nouveau.img
生成新的内核镜像
[root@gpu-node01 ~]# dracut /boot/initramfs-$(uname -r).img $(uname -r)

重启服务器
[root@gpu-node01 ~]# reboot

3.4、安装GPU驱动

[root@gpu-node01 ~]# ./NVIDIA-Linux-x86_64-418.116.00.run --kernel-source-path=/usr/src/kernels/3.10.0-1127.19.1.el7.x86_64/ -k $(uname -r)

3.5、docker服务添加参数

[root@gpu-node01 tmp]# vi /etc/systemd/system/docker.service
[Unit]
Description=Docker Application Container Engine
Documentation=http://docs.docker.com
After=network.target  containerd.service


[Service]
Type=notify
Environment=GOTRACEBACK=crash
ExecReload=/bin/kill -s HUP $MAINPID
Delegate=yes
KillMode=process
ExecStart=/usr/bin/dockerd \
          --add-runtime nvidia=/usr/bin/nvidia-container-runtime \
          --default-runtime nvidia \
          $DOCKER_OPTS \
          $DOCKER_STORAGE_OPTIONS \
          $DOCKER_NETWORK_OPTIONS \
          $INSECURE_REGISTRY
LimitNOFILE=1048576
LimitNPROC=1048576
LimitCORE=infinity
TimeoutStartSec=1min
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s

[Install]
WantedBy=multi-user.target

3.6、kubelet服务添加GPU参数

[root@gpu-node01 tmp]# vi /etc/systemd/system/kubelet.service
[Unit]
Description=Kubernetes Kubelet Server
Documentation=https://github.com/GoogleCloudPlatform/kubernetes
After=docker.service
Wants=docker.socket

[Service]
User=root
EnvironmentFile=-/etc/kubernetes/kubelet.env
ExecStart=/usr/local/bin/kubelet \
                $KUBE_LOGTOSTDERR \
                $KUBE_LOG_LEVEL \
                $KUBELET_API_SERVER \
                $KUBELET_ADDRESS \
                $KUBELET_PORT \
                $KUBELET_HOSTNAME \
                $KUBELET_ARGS \
                $DOCKER_SOCKET \
                $KUBELET_NETWORK_PLUGIN \
                $KUBELET_VOLUME_PLUGIN \
                $KUBELET_CLOUDPROVIDER \
                $KUBELET_GPU_ARGS
Restart=always
RestartSec=10s

[Install]
WantedBy=multi-user.target

3.7、重启docker服务和kubelet服务

[root@gpu-node01 tmp]# systemctl daemon-reload
[root@gpu-node01 tmp]# systemctl restart docker
[root@gpu-node01 tmp]# systemctl restart kubelet

3.8、安装k8s调度gpu的插件

[root@master opt]# wget https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.6.0/nvidia-device-plugin.yml
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.


# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at


#     http://www.apache.org/licenses/LICENSE-2.0


# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # This annotation is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
      - image: registry.uih/library/nvidia/k8s-device-plugin:1.0.0-beta6
        name: nvidia-device-plugin-ctr
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
在master节点上应用该yaml,插件以DaemonSet模式运行

[root@master ~]# kubectl get po --all-namespaces |grep nvi
kube-system                    nvidia-device-plugin-daemonset-cwlsq                              1/1     Running             0          8d
kube-system                    nvidia-device-plugin-daemonset-p4b4d                              1/1     Running             0          8d
kube-system                    nvidia-device-plugin-daemonset-p5h4c                              1/1     Running             0          8d
kube-system                    nvidia-device-plugin-daemonset-xjbxj                              1/1     Running             0          8d

4、配置GPU节点调度策略

4.1、只允许GPU的服务运行在该节点上

给该GPU节点打一个taint污点标签,只有拥有和这个 taint 相匹配的 toleration 的 pod 才能够被分配到这个节点
[root@master tmp]# kubectl taint nodes gpu-node01 nvidia.com/gpu=true:NoSchedule

4.2、GPU服务只允许运行在gpu服务器上

查看GPU节点的node标签[nvidia.com/gpu=true]
[root@master tmp]# kubectl get node --show-labels | grep true
gpu-node01   Ready    worker   2d1h   v1.16.7   beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,kubernetes.io/arch=amd64,kubernetes.io/hostname=gpu-node01,kubernetes.io/os=linux,node-role.kubernetes.io/worker=,nvidia.com/gpu=true

4.3、GPU服务加上如下两点配置

4.3.1、只允许调度在带有nvidia.com/gpu=true 标签的节点上

nodeSelector: {
    "nvidia.com/gpu": "true"
  }

nodeSelector可以通过打标签的形式让Pod被调度到指定的Node上

4.3.2、允许GPU节点接受为如下tolerations的pod服务

tolerations:
  - key: "nvidia.com/gpu"
    operator: "Exists"
    effect: NoSchedule

NoSchedule:只有拥有和这个 taint 相匹配的 toleration 的 pod 才能够被分配到这个节点

5、ChatbotAI服务helm values 的完整配置

# Default values for ChatbotAI.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

replicaCount: 2

image:
  repository: registry.uih/com.uih.uplus/chatbot_algor_service_feature
  tag: v1.1
  pullPolicy: IfNotPresent


service:
  port: 9890
  type: NodePort
  nodePort: 30890

ingress:
  enabled: false
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
  hosts:
[root@master ai-chatbot]#
[root@master ai-chatbot]# cat values.yaml
# Default values for ChatbotAI.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

replicaCount: 2

image:
  repository: registry.uih/com.uih.uplus/chatbot_algor_service_feature
  tag: v1.1
  pullPolicy: IfNotPresent


service:
  port: 9890
  type: NodePort
  nodePort: 30890

ingress:
  enabled: false
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
  hosts:
    - host: chart-example.local
      paths: []

tls: []
  #  - secretName: chart-example-tls
  #    hosts:
  #      - chart-example.local

resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  #  limits:
   #   nvidia.com/gpu: 1

nodeSelector: {
    "nvidia.com/gpu": "true"
  }

tolerations:
  - key: "nvidia.com/gpu"
    operator: "Exists"
    effect: NoSchedule

affinity: {}

6、总结:

1、GPU节点只允许需要GPU的服务运行在该节点上,通过taint 的 tolerations 来控制,保证带有如下toleration配置的服务才能在GPU节点上运行
tolerations:
  - key: "nvidia.com/gpu"
    operator: "Exists"
    effect: NoSchedule
2、所有的GPU服务只允许在GPU节点上运行,通过如下节点标签来控制
nodeSelector: {
    "nvidia.com/gpu": "true"
  }

最终达到我们所要的结果:GPU服务每次都调度到GPU节点上,GPU节点只允许GPU服务运行在该节点上

暂无评论