Skip to content
Snippets Groups Projects
Commit aeb60ad5 authored by Kenny Ho's avatar Kenny Ho
Browse files

Add integration with libdrm

Use libdrm to talk to the amdgpu driver.  It is not used for anything
critical at this point, but it will later be used for things like
GPU hang detection.

The amdgpu firmware version is obtained in two different ways in order to
validate the libdrm integration.  The same firmware version can be read
via debugfs from /sys/kernel/debug/dri/(card num)/amdgpu_firmware_info
or queried with the amdgpu_query_firmware_version API from libdrm.
parent ad68279d
No related branches found
No related tags found
No related merge requests found
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM golang:1.9.5-alpine3.7
RUN apk --no-cache add git
RUN apk --no-cache add git pkgconfig build-base libdrm-dev
RUN mkdir -p /go/src/github.com/RadeonOpenCompute/k8s-device-plugin
ADD . /go/src/github.com/RadeonOpenCompute/k8s-device-plugin
RUN go install \
......@@ -21,7 +21,7 @@ RUN go install \
FROM alpine:3.7
MAINTAINER Kenny Ho <Kenny.Ho@amd.com>
RUN apk --no-cache add ca-certificates
RUN apk --no-cache add ca-certificates libdrm
WORKDIR /root/
COPY --from=0 /go/bin/k8s-device-plugin .
CMD ["./k8s-device-plugin", "-logtostderr=true", "-stderrthreshold=INFO", "-v=5"]
amdgpu.go 0 → 100644
/**
* Copyright 2018 Advanced Micro Devices, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
// TODO: consider separating this into its own repository/library
package main
// #cgo pkg-config: libdrm libdrm_amdgpu
// #include <stdint.h>
// #include <xf86drm.h>
// #include <drm.h>
// #include <amdgpu.h>
// #include <amdgpu_drm.h>
import "C"
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"github.com/golang/glog"
)
// GetAMDGPUs lists the PCI devices found under the amdgpu driver's sysfs
// directory.
//
// The returned map is keyed by the base name of each matching sysfs entry
// (e.g. "0000:19:00.0") and holds the base name of the first drm/card* entry
// found beneath it. Entries without a drm/card* child are left out. When the
// amdgpu driver directory cannot be stat'ed, a warning is logged and an empty
// map is returned.
func GetAMDGPUs() map[string]string {
	gpus := make(map[string]string)

	if _, statErr := os.Stat("/sys/module/amdgpu/drivers/"); statErr != nil {
		glog.Warningf("amdgpu driver unavailable: %s", statErr)
		return gpus
	}

	//ex: /sys/module/amdgpu/drivers/pci:amdgpu/0000:19:00.0
	pciDirs, _ := filepath.Glob("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*")
	for _, pciDir := range pciDirs {
		glog.Info(pciDir)
		cards, _ := filepath.Glob(pciDir + "/drm/card*")
		if len(cards) == 0 {
			continue
		}
		// Only the first card entry is recorded for a given PCI device.
		gpus[filepath.Base(pciDir)] = filepath.Base(cards[0])
	}
	return gpus
}
// AMDGPU reports whether the DRM card named cardName (e.g. "card0") has the
// AMD PCI vendor ID, as read from /sys/class/drm/<cardName>/device/vendor.
//
// A read failure is logged and reported as false.
func AMDGPU(cardName string) bool {
	sysfsVendorPath := "/sys/class/drm/" + cardName + "/device/vendor"

	raw, readErr := ioutil.ReadFile(sysfsVendorPath)
	if readErr != nil {
		glog.Errorf("Error opening %s: %s", sysfsVendorPath, readErr)
		return false
	}

	// AMD vendor ID is 0x1002
	return strings.TrimSpace(string(raw)) == "0x1002"
}
// openAMDGPU opens /dev/dri/<cardName> and initializes a libdrm amdgpu device
// handle for it.
//
// It returns an error when the card does not report the AMD vendor ID, when
// the device node cannot be opened, or when amdgpu_device_initialize returns
// a negative code. On success the caller owns the returned handle and is
// expected to release it with amdgpu_device_deinitialize.
func openAMDGPU(cardName string) (C.amdgpu_device_handle, error) {
	if !AMDGPU(cardName) {
		return nil, fmt.Errorf("%s is not an AMD GPU", cardName)
	}

	devPath := "/dev/dri/" + cardName
	dev, err := os.Open(devPath)
	if err != nil {
		return nil, fmt.Errorf("Fail to open %s: %s", devPath, err)
	}
	// NOTE(review): the file is closed on return while the handle lives on;
	// this presumes libdrm keeps its own reference to the device — confirm.
	defer dev.Close()

	devFd := C.int(dev.Fd())
	var devHandle C.amdgpu_device_handle
	var major C.uint32_t
	var minor C.uint32_t

	rc := C.amdgpu_device_initialize(devFd, &major, &minor, &devHandle)
	if rc < 0 {
		// Report the libdrm return code; err is always nil at this point.
		return nil, fmt.Errorf("Fail to initialize %s: %d", devPath, rc)
	}
	glog.Infof("Initialized AMD GPU version: major %d, minor %d", major, minor)

	return devHandle, nil
}
// AMDGPUDevFunctional reports whether the card named cardName can be opened
// and initialized through openAMDGPU. The handle obtained for the check is
// released before returning; a failure is logged and reported as false.
func AMDGPUDevFunctional(cardName string) bool {
	handle, openErr := openAMDGPU(cardName)
	if openErr == nil {
		C.amdgpu_device_deinitialize(handle)
		return true
	}

	glog.Errorf("%s", openErr)
	return false
}
// AMDGPUGetFirmwareVersions queries libdrm for the firmware information of
// the card named cardName (e.g. "card0").
//
// It returns two maps keyed by firmware name ("VCE", "UVD", "MC", "ME",
// "PFP", "CE", "RLC", "MEC", "SMC", "SDMA0"): the first holds the feature
// versions, the second the firmware versions. If the device cannot be opened
// the error is logged and both maps are empty. A firmware whose query fails
// is logged and omitted from both maps rather than being recorded with the
// values left over from the previous query.
func AMDGPUGetFirmwareVersions(cardName string) (map[string]uint32, map[string]uint32) {
	featVersions := map[string]uint32{}
	fwVersions := map[string]uint32{}

	devHandle, err := openAMDGPU(cardName)
	if err != nil {
		glog.Errorf("%s", err)
		return featVersions, fwVersions
	}
	defer C.amdgpu_device_deinitialize(devHandle)

	// Queried in this order; every query uses ip_instance 0 and index 0.
	queries := []struct {
		name   string
		fwType C.uint
	}{
		{"VCE", C.AMDGPU_INFO_FW_VCE},
		{"UVD", C.AMDGPU_INFO_FW_UVD},
		{"MC", C.AMDGPU_INFO_FW_GMC},
		{"ME", C.AMDGPU_INFO_FW_GFX_ME},
		{"PFP", C.AMDGPU_INFO_FW_GFX_PFP},
		{"CE", C.AMDGPU_INFO_FW_GFX_CE},
		{"RLC", C.AMDGPU_INFO_FW_GFX_RLC},
		{"MEC", C.AMDGPU_INFO_FW_GFX_MEC},
		{"SMC", C.AMDGPU_INFO_FW_SMC},
		{"SDMA0", C.AMDGPU_INFO_FW_SDMA},
	}

	for _, q := range queries {
		// Fresh outputs per query so a failure can never leak stale values.
		var ver, feat C.uint32_t
		if rc := C.amdgpu_query_firmware_version(devHandle, q.fwType, 0, 0, &ver, &feat); rc != 0 {
			glog.Warningf("Fail to query %s firmware version of %s: %d", q.name, cardName, rc)
			continue
		}
		featVersions[q.name] = uint32(feat)
		fwVersions[q.name] = uint32(ver)
	}

	return featVersions, fwVersions
}
// fwVersionRe matches one line of amdgpu_firmware_info, e.g.
// "VCE feature version: 0, firmware version: 0x352d0400", capturing the
// firmware name, the decimal feature version and the hex firmware version.
var fwVersionRe = regexp.MustCompile(`(\w+) feature version: (\d+), firmware version: (0x[0-9a-fA-F]+)`)

// parseDebugFSFirmwareInfo parses the amdgpu_firmware_info file at path.
//
// It returns two maps keyed by firmware name: the first holds the feature
// versions, the second the firmware versions. Lines that do not match
// fwVersionRe are ignored. A line whose numbers do not fit in 32 bits is
// logged and skipped. If the file cannot be opened or read, the error is
// logged and whatever was parsed so far (possibly nothing) is returned.
func parseDebugFSFirmwareInfo(path string) (map[string]uint32, map[string]uint32) {
	feat := make(map[string]uint32)
	fw := make(map[string]uint32)

	glog.Info("Parsing " + path)
	f, err := os.Open(path)
	if err != nil {
		glog.Error("Fail to open " + path)
		return feat, fw
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		m := fwVersionRe.FindStringSubmatch(scanner.Text())
		if m == nil {
			continue
		}

		// The feature version is matched as plain decimal digits, so parse it
		// as base 10; base 0 would read a leading zero as octal.
		featVer, err := strconv.ParseUint(m[2], 10, 32)
		if err != nil {
			glog.Warningf("Fail to parse %s feature version %q: %s", m[1], m[2], err)
			continue
		}
		// Unsigned parsing keeps firmware versions >= 0x80000000 intact;
		// base 0 honours the "0x" prefix required by the regexp.
		fwVer, err := strconv.ParseUint(m[3], 0, 32)
		if err != nil {
			glog.Warningf("Fail to parse %s firmware version %q: %s", m[1], m[3], err)
			continue
		}

		feat[m[1]] = uint32(featVer)
		fw[m[1]] = uint32(fwVer)
	}
	if err := scanner.Err(); err != nil {
		glog.Errorf("Fail to read %s: %s", path, err)
	}

	return feat, fw
}
/**
* Copyright 2018 Advanced Micro Devices, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package main
import (
"io/ioutil"
"path/filepath"
"strings"
"testing"
)
// hasAMDGPU reports whether GetAMDGPUs finds at least one device. The
// *testing.T argument is accepted but not used.
func hasAMDGPU(t *testing.T) bool {
	return len(GetAMDGPUs()) > 0
}
// TestAMDGPUFirmwareVersionConsistent checks, for every detected AMD GPU,
// that each feature/firmware version returned by AMDGPUGetFirmwareVersions
// equals the value parsed from the card's debugfs amdgpu_firmware_info file.
// The test is skipped when no AMD GPU is found.
func TestAMDGPUFirmwareVersionConsistent(t *testing.T) {
	if !hasAMDGPU(t) {
		t.Skip("Skipping test, no AMD GPU found.")
	}

	for pciAddr, cardName := range GetAMDGPUs() {
		t.Logf("%s, %s", pciAddr, cardName)

		//debugfs path/interface may not be stable
		// cardName[4:] drops the leading four characters ("card" in names
		// such as "card0") to obtain the debugfs directory name.
		debugFSPath := "/sys/kernel/debug/dri/" + cardName[4:] + "/amdgpu_firmware_info"
		debugFSFeat, debugFSFw := parseDebugFSFirmwareInfo(debugFSPath)
		ioctlFeat, ioctlFw := AMDGPUGetFirmwareVersions(cardName)

		for name, gotFeat := range ioctlFeat {
			if wantFeat := debugFSFeat[name]; gotFeat != wantFeat {
				t.Errorf("%s feature version not consistent: ioctl: %d, debugfs: %d",
					name, gotFeat, wantFeat)
			}
			if gotFw, wantFw := ioctlFw[name], debugFSFw[name]; gotFw != wantFw {
				t.Errorf("%s firmware version not consistent: ioctl: %x, debugfs: %x",
					name, gotFw, wantFw)
			}
		}
	}
}
// TestAMDGPUcountConsistent checks that the number of devices reported by
// GetAMDGPUs equals the number of /sys/class/drm/card* entries whose vendor
// file contains the AMD vendor ID. The test is skipped when no AMD GPU is
// found.
func TestAMDGPUcountConsistent(t *testing.T) {
	if !hasAMDGPU(t) {
		t.Skip("Skipping test, no AMD GPU found.")
	}

	devices := GetAMDGPUs()
	vendorFiles, _ := filepath.Glob("/sys/class/drm/card[0-9]*/device/vendor")

	amdCards := 0
	for _, vendorFile := range vendorFiles {
		t.Log(vendorFile)

		raw, readErr := ioutil.ReadFile(vendorFile)
		vid := string(raw)
		// AMD vendor ID is 0x1002
		if readErr != nil || strings.TrimSpace(vid) != "0x1002" {
			t.Log(vid)
			continue
		}
		amdCards++
	}

	if amdCards != len(devices) {
		t.Errorf("AMD GPU counts differ: /sys/module/amdgpu: %d, /sys/class/drm: %d", len(devices), amdCards)
	}
}
// TestHasAMDGPU exercises hasAMDGPU and marks itself skipped when no AMD GPU
// is found; it makes no other assertion.
func TestHasAMDGPU(t *testing.T) {
	if hasAMDGPU(t) {
		return
	}
	t.Skip("Skipping test, no AMD GPU found.")
}
// TestAMDGPUDevFunctional calls AMDGPUDevFunctional for every detected AMD
// GPU and logs the result. It only logs; a non-functional card does not fail
// the test. The test is skipped when no AMD GPU is found.
func TestAMDGPUDevFunctional(t *testing.T) {
	if !hasAMDGPU(t) {
		t.Skip("Skipping test, no AMD GPU found.")
	}

	for _, cardName := range GetAMDGPUs() {
		functional := AMDGPUDevFunctional(cardName)
		t.Logf("%s functional: %t", cardName, functional)
	}
}
// TestParseDebugFSFirmwareInfo parses the checked-in fixture
// testdata/debugfs-parsing/amdgpu_firmware_info and verifies that every
// expected firmware name is present with the expected feature and firmware
// version, and that no unexpected entries were parsed.
func TestParseDebugFSFirmwareInfo(t *testing.T) {
	wantFeat := map[string]uint32{
		"VCE":   0,
		"UVD":   0,
		"MC":    0,
		"ME":    35,
		"PFP":   35,
		"CE":    35,
		"RLC":   0,
		"MEC":   33,
		"MEC2":  33,
		"SOS":   0,
		"ASD":   0,
		"SMC":   0,
		"SDMA0": 40,
		"SDMA1": 40,
	}
	wantFw := map[string]uint32{
		"VCE":   0x352d0400,
		"UVD":   0x01571100,
		"MC":    0x00000000,
		"ME":    0x00000094,
		"PFP":   0x000000a4,
		"CE":    0x0000004a,
		"RLC":   0x00000058,
		"MEC":   0x00000160,
		"MEC2":  0x00000160,
		"SOS":   0x00161a92,
		"ASD":   0x0016129a,
		"SMC":   0x001c2800,
		"SDMA0": 0x00000197,
		"SDMA1": 0x00000197,
	}

	gotFeat, gotFw := parseDebugFSFirmwareInfo("testdata/debugfs-parsing/amdgpu_firmware_info")

	for name, want := range wantFeat {
		if got, found := gotFeat[name]; !found || got != want {
			t.Errorf("Error parsing feature version for %s: expect %d", name, want)
		}
	}
	for name, want := range wantFw {
		if got, found := gotFw[name]; !found || got != want {
			t.Errorf("Error parsing firmware version for %s: expect %#08x", name, want)
		}
	}

	// Matching sizes rule out extra entries beyond the expected ones.
	if !(len(gotFeat) == len(wantFeat) && len(gotFw) == len(wantFw)) {
		t.Errorf("Incorrect parsing of amdgpu firmware info from debugfs")
	}
}
......@@ -49,7 +49,7 @@ func (p *Plugin) Stop() error {
var topoSIMDre = regexp.MustCompile(`simd_count\s(\d+)`)
func countGPUDev(topoRootParam ...string) int {
func countGPUDevFromTopology(topoRootParam ...string) int {
topoRoot := "/sys/class/kfd/kfd"
if len(topoRootParam) == 1 {
topoRoot = topoRootParam[0]
......@@ -108,7 +108,7 @@ func (p *Plugin) PreStartContainer(ctx context.Context, r *pluginapi.PreStartCon
// Monitors available amdgpu devices and notifies Kubernetes
func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
devCount := countGPUDev()
devCount := countGPUDevFromTopology()
devs := make([]*pluginapi.Device, devCount)
......
......@@ -13,14 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package main
import (
"testing"
)
func TestCountGPUDev(t *testing.T) {
count := countGPUDev("testdata/topology-parsing")
func TestCountGPUDevFromTopology(t *testing.T) {
count := countGPUDevFromTopology("testdata/topology-parsing")
expCount := 2
if count != expCount {
......
VCE feature version: 0, firmware version: 0x352d0400
UVD feature version: 0, firmware version: 0x01571100
MC feature version: 0, firmware version: 0x00000000
ME feature version: 35, firmware version: 0x00000094
PFP feature version: 35, firmware version: 0x000000a4
CE feature version: 35, firmware version: 0x0000004a
RLC feature version: 0, firmware version: 0x00000058
MEC feature version: 33, firmware version: 0x00000160
MEC2 feature version: 33, firmware version: 0x00000160
SOS feature version: 0, firmware version: 0x00161a92
ASD feature version: 0, firmware version: 0x0016129a
SMC feature version: 0, firmware version: 0x001c2800
SDMA0 feature version: 40, firmware version: 0x00000197
SDMA1 feature version: 40, firmware version: 0x00000197
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment