diff --git a/amdgpu.go b/amdgpu.go index 411163756aa4a64450552c9484e8d820e1767f23..0a541503c681d4424940db6d8cdcff6cd04b1297 100644 --- a/amdgpu.go +++ b/amdgpu.go @@ -38,23 +38,29 @@ import ( ) // GetAMDGPUs return a map of AMD GPU on a node identified by the part of the pci address -func GetAMDGPUs() map[string]string { +func GetAMDGPUs() map[string]map[string]int { if _, err := os.Stat("/sys/module/amdgpu/drivers/"); err != nil { glog.Warningf("amdgpu driver unavailable: %s", err) - return make(map[string]string) + return make(map[string]map[string]int) } //ex: /sys/module/amdgpu/drivers/pci:amdgpu/0000:19:00.0 matches, _ := filepath.Glob("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*") - devices := make(map[string]string) + devices := make(map[string]map[string]int) for _, path := range matches { glog.Info(path) - cardPath, _ := filepath.Glob(path + "/drm/card*") - - if len(cardPath) > 0 { - devices[filepath.Base(path)] = filepath.Base(cardPath[0]) + devPaths, _ := filepath.Glob(path + "/drm/*") + devices[filepath.Base(path)] = make(map[string]int) + + for _, devPath := range devPaths { + switch name := filepath.Base(devPath); { + case name[0:4] == "card": + devices[filepath.Base(path)][name[0:4]], _ = strconv.Atoi(name[4:]) + case name[0:7] == "renderD": + devices[filepath.Base(path)][name[0:7]], _ = strconv.Atoi(name[7:]) + } } } return devices diff --git a/amdgpu_test.go b/amdgpu_test.go index a18cca7b5dba0d24285276f5787d6ff2717ce21f..f21a31e4d65a88e5f37c2de1f25c93dfeefd319f 100644 --- a/amdgpu_test.go +++ b/amdgpu_test.go @@ -17,6 +17,7 @@ package main import ( + "fmt" "io/ioutil" "path/filepath" "strings" @@ -39,7 +40,8 @@ func TestAMDGPUFirmwareVersionConsistent(t *testing.T) { devices := GetAMDGPUs() - for pci, card := range devices { + for pci, dev := range devices { + card := fmt.Sprintf("card%d", dev["card"]) t.Logf("%s, %s", pci, card) //debugfs path/interface may not be stable @@ -103,7 +105,9 @@ func TestAMDGPUDevFunctional(t *testing.T) { devices := GetAMDGPUs() - for _, card := range devices { + for _, dev := range devices { + card := fmt.Sprintf("card%d", dev["card"]) + ret := AMDGPUDevFunctional(card) t.Logf("%s functional: %t", card, ret) } diff --git a/main.go b/main.go index 1c9f6d06b82266eb29b7cfa5fbd697d57175d724..803ddd6fdad8a57e895fe723cb02e40690277064 100644 --- a/main.go +++ b/main.go @@ -35,6 +35,7 @@ import ( // Plugin is identical to DevicePluginServer interface of device plugin API. type Plugin struct { + AMDGPUs map[string]map[string]int Heartbeat chan bool } @@ -123,15 +124,17 @@ func (p *Plugin) PreStartContainer(ctx context.Context, r *pluginapi.PreStartCon // Whenever a Device state change or a Device disappears, ListAndWatch // returns the new list func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { - devCount := countGPUDevFromTopology() + p.AMDGPUs = GetAMDGPUs() - devs := make([]*pluginapi.Device, devCount) + devs := make([]*pluginapi.Device, len(p.AMDGPUs)) - for i := 0; i < devCount; i++ { + i := 0 + for id := range p.AMDGPUs { devs[i] = &pluginapi.Device{ - ID: fmt.Sprintf("gpu%d", i), + ID: id, Health: pluginapi.Healthy, } + i++ } s.Send(&pluginapi.ListAndWatchResponse{Devices: devs}) @@ -146,7 +149,7 @@ func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListA health = pluginapi.Healthy } - for i := 0; i < devCount; i++ { + for i := 0; i < len(p.AMDGPUs); i++ { devs[i].Health = health } s.Send(&pluginapi.ListAndWatchResponse{Devices: devs}) @@ -159,23 +162,36 @@ func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListA // Plugin can run device specific operations and instruct Kubelet // of the steps to make the Device available in the container func (p *Plugin) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - car := new(pluginapi.ContainerAllocateResponse) - - // Currently, there are only 1 /dev/kfd per nodes regardless of the # of GPU available - dev := new(pluginapi.DeviceSpec) - dev.HostPath = "/dev/kfd" - dev.ContainerPath = "/dev/kfd" - dev.Permissions = "rw" - car.Devices = append(car.Devices, dev) - - dev = new(pluginapi.DeviceSpec) - dev.HostPath = "/dev/dri" - dev.ContainerPath = "/dev/dri" - dev.Permissions = "rw" - car.Devices = append(car.Devices, dev) - var response pluginapi.AllocateResponse - response.ContainerResponses = append(response.ContainerResponses, car) + var car pluginapi.ContainerAllocateResponse + var dev *pluginapi.DeviceSpec + + for _, req := range r.ContainerRequests { + car = pluginapi.ContainerAllocateResponse{} + + // Currently, there are only 1 /dev/kfd per nodes regardless of the # of GPU available + // for compute/rocm/HSA use cases + dev = new(pluginapi.DeviceSpec) + dev.HostPath = "/dev/kfd" + dev.ContainerPath = "/dev/kfd" + dev.Permissions = "rw" + car.Devices = append(car.Devices, dev) + + for _, id := range req.DevicesIDs { + glog.Infof("Allocating device ID: %s", id) + + for k, v := range p.AMDGPUs[id] { + devpath := fmt.Sprintf("/dev/dri/%s%d", k, v) + dev = new(pluginapi.DeviceSpec) + dev.HostPath = devpath + dev.ContainerPath = devpath + dev.Permissions = "rw" + car.Devices = append(car.Devices, dev) + } + } + + response.ContainerResponses = append(response.ContainerResponses, &car) + } return &response, nil }