180 lines
5.7 KiB
Go
180 lines
5.7 KiB
Go
package devices
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/csv"
|
|
"errors"
|
|
"fmt"
|
|
"os/exec"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Set up variables and register this plug-in with the main code.
|
|
// The functions Register*(f) tell gotop which of these plugin functions to
|
|
// call to update data; the RegisterStartup() function sets the function
|
|
// that gotop will call when everything else has been done and the plugin
|
|
// should start collecting data.
|
|
//
|
|
// In this plugin, one call to the nvidia program returns *all* the data
|
|
// we're looking for, but gotop will call each update function during each
|
|
// cycle. This means that the nvidia program would be called 3 (or more)
|
|
// times per update, which isn't very efficient. Therefore, we make this
|
|
// code more complex to run a job in the background that runs the nvidia
|
|
// tool periodically and puts the results into hashes; the update functions
|
|
// then just sync data from those hashes into the return data.
|
|
func init() {
|
|
RegisterStartup(startNVidia)
|
|
}
|
|
|
|
// updateNvidiaTemp copies data from the local _temps cache into the passed-in
|
|
// return-value map. It is called once per cycle by gotop.
|
|
func updateNvidiaTemp(temps map[string]int) map[string]error {
|
|
nvidiaLock.Lock()
|
|
defer nvidiaLock.Unlock()
|
|
for k, v := range _temps {
|
|
temps[k] = v
|
|
}
|
|
return _errors
|
|
}
|
|
|
|
// updateNvidiaMem copies data from the local _mems cache into the passed-in
|
|
// return-value map. It is called once per cycle by gotop.
|
|
func updateNvidiaMem(mems map[string]MemoryInfo) map[string]error {
|
|
nvidiaLock.Lock()
|
|
defer nvidiaLock.Unlock()
|
|
for k, v := range _mems {
|
|
mems[k] = v
|
|
}
|
|
return _errors
|
|
}
|
|
|
|
// updateNvidiaUsage copies data from the local _cpus cache into the passed-in
|
|
// return-value map. It is called once per cycle by gotop.
|
|
func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error {
|
|
nvidiaLock.Lock()
|
|
defer nvidiaLock.Unlock()
|
|
for k, v := range _cpus {
|
|
cpus[k] = v
|
|
}
|
|
return _errors
|
|
}
|
|
|
|
// startNVidia is called once by gotop, and forks a thread to call the nvidia
|
|
// tool periodically and update the cached cpu, memory, and temperature
|
|
// values that are used by the update*() functions to return data to gotop.
|
|
//
|
|
// The vars argument contains command-line arguments to allow the plugin
|
|
// to change runtime options; the only option currently supported is the
|
|
// `nvidia-refresh` arg, which is expected to be a time.Duration value and
|
|
// sets how frequently the nvidia tool is called to refresh the date.
|
|
func startNVidia(vars map[string]string) error {
|
|
if vars["nvidia"] != "true" {
|
|
return nil
|
|
}
|
|
_, err := exec.Command("nvidia-smi", "-L").Output()
|
|
if err != nil {
|
|
return errors.New(fmt.Sprintf("NVidia GPU error: %s", err))
|
|
}
|
|
_errors = make(map[string]error)
|
|
_temps = make(map[string]int)
|
|
_mems = make(map[string]MemoryInfo)
|
|
_cpus = make(map[string]int)
|
|
_errors = make(map[string]error)
|
|
RegisterTemp(updateNvidiaTemp)
|
|
RegisterMem(updateNvidiaMem)
|
|
RegisterCPU(updateNvidiaUsage)
|
|
|
|
nvidiaLock = sync.Mutex{}
|
|
// Get the refresh period from the passed-in command-line/config
|
|
// file options
|
|
refresh := time.Second
|
|
if v, ok := vars["nvidia-refresh"]; ok {
|
|
if refresh, err = time.ParseDuration(v); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
// update once to populate the device names, for the widgets.
|
|
updateNvidia()
|
|
// Fork off a long-running job to call the nvidia tool periodically,
|
|
// parse out the values, and put them in the cache.
|
|
go func() {
|
|
timer := time.Tick(refresh)
|
|
for range timer {
|
|
updateNvidia()
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// Caches for the output from the nvidia tool; the update() functions pull
|
|
// from these and return the values to gotop when requested.
|
|
var (
|
|
_temps map[string]int
|
|
_mems map[string]MemoryInfo
|
|
_cpus map[string]int
|
|
// A cache of errors generated by the background job running the nvidia tool;
|
|
// these errors are returned to gotop when it calls the update() functions.
|
|
_errors map[string]error
|
|
)
|
|
|
|
var nvidiaLock sync.Mutex
|
|
|
|
// updateNvidia calls the nvidia tool, parses the output, and caches the results
|
|
// in the various _* maps. The metric data parsed is: name, index,
|
|
// temperature.gpu, utilization.gpu, utilization.memory, memory.total,
|
|
// memory.free, memory.used
|
|
//
|
|
// If this function encounters an error calling `nvidia-smi`, it caches the
|
|
// error and returns immediately. We expect exec errors only when the tool
|
|
// isn't available, or when it fails for some reason; no exec error cases
|
|
// are recoverable. This does **not** stop the cache job; that will continue
|
|
// to run and continue to call updateNvidia().
|
|
func updateNvidia() {
|
|
bs, err := exec.Command(
|
|
"nvidia-smi",
|
|
"--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used",
|
|
"--format=csv,noheader,nounits").Output()
|
|
if err != nil {
|
|
_errors["nvidia"] = err
|
|
//bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206")
|
|
return
|
|
}
|
|
csvReader := csv.NewReader(bytes.NewReader(bs))
|
|
csvReader.TrimLeadingSpace = true
|
|
records, err := csvReader.ReadAll()
|
|
if err != nil {
|
|
_errors["nvidia"] = err
|
|
return
|
|
}
|
|
|
|
// Ensure we're not trying to modify the caches while they're being read by the update() functions.
|
|
nvidiaLock.Lock()
|
|
defer nvidiaLock.Unlock()
|
|
// Errors during parsing are recorded, but do not stop parsing.
|
|
for _, row := range records {
|
|
// The name of the devices is the nvidia-smi "<name>.<index>"
|
|
name := row[0] + "." + row[1]
|
|
if _temps[name], err = strconv.Atoi(row[2]); err != nil {
|
|
_errors[name] = err
|
|
}
|
|
if _cpus[name], err = strconv.Atoi(row[3]); err != nil {
|
|
_errors[name] = err
|
|
}
|
|
t, err := strconv.Atoi(row[4])
|
|
if err != nil {
|
|
_errors[name] = err
|
|
}
|
|
u, err := strconv.Atoi(row[5])
|
|
if err != nil {
|
|
_errors[name] = err
|
|
}
|
|
_mems[name] = MemoryInfo{
|
|
Total: 1048576 * uint64(t),
|
|
Used: 1048576 * uint64(u),
|
|
UsedPercent: (float64(u) / float64(t)) * 100.0,
|
|
}
|
|
}
|
|
}
|