Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions MPSUserGuide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# How to start MPS

1. Start MPS daemon in node
1.1 export CUDA_VISIBLE_DEVICES=ID
> Specify which GPUs should be visible to a CUDA application.

1.2 export CUDA_MPS_PIPE_DIRECTORY=Directory
> The MPS control daemon, the MPS server, and the associated MPS clients communicate with each other via named pipes and UNIX domain sockets.
The default directory for these pipes and sockets is /tmp/nvidia-mps.
CUDA_MPS_PIPE_DIRECTORY can be used to override the location of these pipes and sockets.

1.3 export CUDA_MPS_LOG_DIRECTORY=Directory
> The MPS control daemon maintains a control.log file and server.log file in the directory.

1.4 nvidia-smi -i ID -c EXCLUSIVE_PROCESS
> Three compute modes are supported via settings accessible in nvidia-smi: PROHIBITED, EXCLUSIVE_PROCESS, and DEFAULT. Make sure your GPU is in EXCLUSIVE_PROCESS mode.

1.5 nvidia-cuda-mps-control -d
> start MPS daemon

2. Add additional information in yaml

2.1 set hostIPC=true in podspec
> spec:
hostIPC: true

2.2 add environment information
* CUDA_MPS_ACTIVE_THREAD_PERCENTAGE="number" //0-100
> Setting this in an MPS client’s environment will constrain the portion of available threads of each device.
* CUDA_MPS_PIPE_DIRECTORY=Directory
> Make sure this directory is the same as the one you set on the node.

2.3 add volume information
* volumeMount
> The same as CUDA_MPS_PIPE_DIRECTORY set on node.
* volumes
> hostPath the same as CUDA_MPS_PIPE_DIRECTORY set on node.

2.4 An example of the additional information when CUDA_MPS_PIPE_DIRECTORY=/root/nvidia-mps is set
![example](https://ws3.sinaimg.cn/large/006tNc79ly1g4tqjrvr8rj30ou0f075w.jpg)



3 changes: 2 additions & 1 deletion cmd/nvidia/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ var (
mps = flag.Bool("mps", false, "Enable or Disable MPS")
healthCheck = flag.Bool("health-check", false, "Enable or disable Health check")
memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'")
mpspipe = flag.String("mps-pipe", "/tmp/nvidia-mps", " pipes and UNIX domain sockets")
)

func main() {
flag.Parse()
log.V(1).Infoln("Start gpushare device plugin")
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit))
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *mpspipe, translatememoryUnits(*memoryUnit))
err := ngm.Run()
if err != nil {
log.Fatalf("Failed due to %v", err)
Expand Down
2 changes: 2 additions & 0 deletions device-plugin-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ spec:
- gpushare-device-plugin-v2
- -logtostderr
- --v=5
# - --mps-pipe=/root/nvidia-mps // The MPS clients and the MPS server communicate through this directory. You can modify it.
# - --mps=true //if you want to use mps
- --memory-unit=GiB
resources:
limits:
Expand Down
29 changes: 21 additions & 8 deletions pkg/gpu/nvidia/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,22 @@ func init() {
kubeInit()
}

func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
func (m *NvidiaDevicePlugin) buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
responses := pluginapi.AllocateResponse{}
for _, req := range reqs.ContainerRequests {
response := pluginapi.ContainerAllocateResponse{
Envs: map[string]string{
envNVGPU: fmt.Sprintf("no-gpu-has-%dMiB-to-run", podReqGPU),
envNVGPU: fmt.Sprintf("no-gpu-has-%dGiB-to-run", podReqGPU),
EnvResourceIndex: fmt.Sprintf("-1"),
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
if m.mps {
response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*uint(len(req.DevicesIDs))/getGPUMemory())
response.Envs[EnvMPSPipeDirectory] = fmt.Sprintf(m.mpspipe)
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}
return &responses
Expand Down Expand Up @@ -62,7 +66,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
pods, err := getCandidatePods()
if err != nil {
log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}

if log.V(4) {
Expand Down Expand Up @@ -106,7 +110,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
}

if id < 0 {
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}

// 1. Create container requests
Expand All @@ -121,6 +125,15 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
},
}
if m.mps {
response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*reqGPU/getGPUMemory())
response.Envs[EnvMPSPipeDirectory] = fmt.Sprintf(m.mpspipe)
mount := pluginapi.Mount{
ContainerPath: m.mpspipe,
HostPath: m.mpspipe,
}
response.Mounts = append(response.Mounts, &mount)
}
responses.ContainerResponses = append(responses.ContainerResponses, &response)
}

Expand All @@ -134,25 +147,25 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
pod, err := clientset.CoreV1().Pods(assumePod.Namespace).Get(assumePod.Name, metav1.GetOptions{})
if err != nil {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}
newPod = updatePodAnnotations(pod)
_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
if err != nil {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}
} else {
log.Warningf("Failed due to %v", err)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}
}

} else {
log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.",
podReqGPU)
// return &responses, fmt.Errorf("invalid allocation requst: request GPU memory %d can't be satisfied", reqGPU)
return buildErrResponse(reqs, podReqGPU), nil
return m.buildErrResponse(reqs, podReqGPU), nil
}

log.Infof("new allocated GPUs info %v", &responses)
Expand Down
18 changes: 10 additions & 8 deletions pkg/gpu/nvidia/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,16 @@ const (
containerLogPathLabelKey = "io.kubernetes.container.logpath"
sandboxIDLabelKey = "io.kubernetes.sandbox.id"

envNVGPU = "NVIDIA_VISIBLE_DEVICES"
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
EnvMPSPipeDirectory = "CUDA_MPS_PIPE_DIRECTORY"
EnvMPSActiveThreadPercentage = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"

GiBPrefix = MemoryUnit("GiB")
MiBPrefix = MemoryUnit("MiB")
Expand Down
6 changes: 4 additions & 2 deletions pkg/gpu/nvidia/gpumanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ import (
// sharedGPUManager holds the node-level options used to create (and
// re-create, on restart) the NVIDIA device plugin.
type sharedGPUManager struct {
enableMPS bool // enable CUDA MPS support for allocated containers
healthCheck bool // enable device health checking
mpspipe string // MPS pipe/socket directory passed through to the device plugin
}

func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager {
// NewSharedGPUManager builds a sharedGPUManager from the command-line
// options. mpspipe is the directory used for MPS named pipes and UNIX
// domain sockets. bp is the memory unit used when reporting GPU memory
// (flags elsewhere suggest "GiB" or "MiB" — confirm with translatememoryUnits).
func NewSharedGPUManager(enableMPS, healthCheck bool, mpspipe string, bp MemoryUnit) *sharedGPUManager {
metric = bp // NOTE(review): assigns what appears to be a package-level var — global side effect
return &sharedGPUManager{
enableMPS: enableMPS,
healthCheck: healthCheck,
mpspipe: mpspipe,
}
}

Expand Down Expand Up @@ -61,7 +63,7 @@ L:
devicePlugin.Stop()
}

devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck)
devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.mpspipe)
if err := devicePlugin.Serve(); err != nil {
log.Warningf("Failed to start device plugin due to %v", err)
} else {
Expand Down
9 changes: 5 additions & 4 deletions pkg/gpu/nvidia/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ type NvidiaDevicePlugin struct {
devIndxMap map[uint]string
socket string
mps bool
mpspipe string
healthCheck bool

stop chan struct{}
Expand All @@ -32,7 +33,7 @@ type NvidiaDevicePlugin struct {
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
func NewNvidiaDevicePlugin(mps, healthCheck bool, mpspipe string) *NvidiaDevicePlugin {
devs, devNameMap := getDevices()
devList := []string{}

Expand All @@ -54,10 +55,10 @@ func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
devNameMap: devNameMap,
socket: serverSock,
mps: mps,
mpspipe: mpspipe,
healthCheck: healthCheck,

stop: make(chan struct{}),
health: make(chan *pluginapi.Device),
stop: make(chan struct{}),
health: make(chan *pluginapi.Device),
}
}

Expand Down