xmtop/devices/nvidia.go

180 lines
5.7 KiB
Go

package devices
import (
"bytes"
"encoding/csv"
"errors"
"fmt"
"os/exec"
"strconv"
"sync"
"time"
)
// init hooks this plugin into the main program. The only thing it does is
// hand startNVidia to RegisterStartup; the per-metric update functions are
// not registered here. startNVidia registers those itself (via RegisterTemp,
// RegisterMem and RegisterCPU) once it has confirmed that the plugin is
// enabled and that the nvidia tool can be run.
//
// Design note: one call to the nvidia program returns *all* the data we're
// looking for, but gotop will call each update function during each cycle.
// Calling the nvidia program from every update function would run it 3 (or
// more) times per update, which isn't very efficient. Therefore this plugin
// is more complex than strictly necessary: a background job runs the nvidia
// tool periodically and stores the results in package-level maps, and the
// update functions just copy data from those maps into the maps they are
// given.
func init() {
RegisterStartup(startNVidia)
}
// updateNvidiaTemp copies the cached temperature readings from _temps into
// the passed-in temps map, using the same device-name keys as the cache. It
// is called once per cycle by gotop.
//
// The return value is a snapshot of the cached errors, copied while
// nvidiaLock is held. A copy is returned instead of _errors itself because
// the background refresh job keeps writing to _errors; handing out the live
// map would let the caller read it, after the lock has been released, while
// it is being modified.
func updateNvidiaTemp(temps map[string]int) map[string]error {
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()

	for name, temp := range _temps {
		temps[name] = temp
	}

	errs := make(map[string]error, len(_errors))
	for name, err := range _errors {
		errs[name] = err
	}
	return errs
}
// updateNvidiaMem copies the cached memory statistics from _mems into the
// passed-in mems map, using the same device-name keys as the cache. It is
// called once per cycle by gotop.
//
// The return value is a snapshot of the cached errors, copied while
// nvidiaLock is held. A copy is returned instead of _errors itself because
// the background refresh job keeps writing to _errors; handing out the live
// map would let the caller read it, after the lock has been released, while
// it is being modified.
func updateNvidiaMem(mems map[string]MemoryInfo) map[string]error {
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()

	for name, info := range _mems {
		mems[name] = info
	}

	errs := make(map[string]error, len(_errors))
	for name, err := range _errors {
		errs[name] = err
	}
	return errs
}
// updateNvidiaUsage copies the cached utilization figures from _cpus into
// the passed-in cpus map, using the same device-name keys as the cache. It
// is called once per cycle by gotop. The second (bool) parameter is part of
// the signature but is ignored by this plugin.
//
// The return value is a snapshot of the cached errors, copied while
// nvidiaLock is held. A copy is returned instead of _errors itself because
// the background refresh job keeps writing to _errors; handing out the live
// map would let the caller read it, after the lock has been released, while
// it is being modified.
func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error {
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()

	for name, usage := range _cpus {
		cpus[name] = usage
	}

	errs := make(map[string]error, len(_errors))
	for name, err := range _errors {
		errs[name] = err
	}
	return errs
}
// startNVidia is called once by gotop. When the plugin is enabled it
// registers the update*() functions and forks a goroutine that calls the
// nvidia tool periodically to refresh the cached cpu, memory, and
// temperature values those functions return to gotop.
//
// The vars argument contains command-line/config options:
//
//   - `nvidia`: the plugin does nothing (and returns nil) unless this is
//     exactly "true".
//   - `nvidia-refresh`: optional; a time.Duration string that sets how
//     frequently the nvidia tool is called to refresh the data. Defaults to
//     one second. It must be positive.
//
// An error is returned if `nvidia-smi -L` cannot be run, or if
// `nvidia-refresh` is not a valid positive duration. All validation happens
// before anything is registered, so a failed start leaves no update
// functions behind.
func startNVidia(vars map[string]string) error {
	if vars["nvidia"] != "true" {
		return nil
	}

	// Probe the tool once up front; if it cannot even list devices there is
	// no point in registering anything.
	if _, err := exec.Command("nvidia-smi", "-L").Output(); err != nil {
		return fmt.Errorf("NVidia GPU error: %w", err)
	}

	// Get the refresh period from the passed-in command-line/config file
	// options.
	refresh := time.Second
	if v, ok := vars["nvidia-refresh"]; ok {
		d, err := time.ParseDuration(v)
		if err != nil {
			return fmt.Errorf("invalid nvidia-refresh %q: %w", v, err)
		}
		// time.Tick returns a nil channel for a non-positive duration, which
		// would leave the refresh goroutine blocked forever.
		if d <= 0 {
			return errors.New("nvidia-refresh must be a positive duration")
		}
		refresh = d
	}

	// The caches must exist before the update functions are registered and
	// before the first refresh writes to them. nvidiaLock needs no setup:
	// the zero value of sync.Mutex is ready to use.
	_errors = make(map[string]error)
	_temps = make(map[string]int)
	_mems = make(map[string]MemoryInfo)
	_cpus = make(map[string]int)

	RegisterTemp(updateNvidiaTemp)
	RegisterMem(updateNvidiaMem)
	RegisterCPU(updateNvidiaUsage)

	// Update once to populate the device names, for the widgets.
	updateNvidia()

	// Fork off a long-running job to call the nvidia tool periodically,
	// parse out the values, and put them in the cache. The job is never
	// stopped.
	go func() {
		for range time.Tick(refresh) {
			updateNvidia()
		}
	}()
	return nil
}
// State shared between the background refresh job and the update*()
// functions that gotop calls.
var (
	// nvidiaLock is held by the update*() functions while they read the
	// caches, and by updateNvidia while it stores freshly parsed values.
	nvidiaLock sync.Mutex

	// Most recent values parsed from the nvidia tool's output. The update*()
	// functions copy from these into the maps gotop hands them.
	_temps map[string]int
	_cpus  map[string]int
	_mems  map[string]MemoryInfo

	// _errors holds errors produced by the background job that runs the
	// nvidia tool; the update*() functions return them to gotop.
	_errors map[string]error
)
// updateNvidia calls the nvidia tool, parses the output, and caches the results
// in the various _* maps. The metric data parsed is: name, index,
// temperature.gpu, utilization.gpu, utilization.memory, memory.total,
// memory.free, memory.used
//
// If this function encounters an error calling `nvidia-smi`, it caches the
// error and returns immediately. We expect exec errors only when the tool
// isn't available, or when it fails for some reason; no exec error cases
// are recoverable. This does **not** stop the cache job; that will continue
// to run and continue to call updateNvidia().
func updateNvidia() {
bs, err := exec.Command(
"nvidia-smi",
"--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used",
"--format=csv,noheader,nounits").Output()
if err != nil {
_errors["nvidia"] = err
//bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206")
return
}
csvReader := csv.NewReader(bytes.NewReader(bs))
csvReader.TrimLeadingSpace = true
records, err := csvReader.ReadAll()
if err != nil {
_errors["nvidia"] = err
return
}
// Ensure we're not trying to modify the caches while they're being read by the update() functions.
nvidiaLock.Lock()
defer nvidiaLock.Unlock()
// Errors during parsing are recorded, but do not stop parsing.
for _, row := range records {
// The name of the devices is the nvidia-smi "<name>.<index>"
name := row[0] + "." + row[1]
if _temps[name], err = strconv.Atoi(row[2]); err != nil {
_errors[name] = err
}
if _cpus[name], err = strconv.Atoi(row[3]); err != nil {
_errors[name] = err
}
t, err := strconv.Atoi(row[4])
if err != nil {
_errors[name] = err
}
u, err := strconv.Atoi(row[5])
if err != nil {
_errors[name] = err
}
_mems[name] = MemoryInfo{
Total: 1048576 * uint64(t),
Used: 1048576 * uint64(u),
UsedPercent: (float64(u) / float64(t)) * 100.0,
}
}
}