First draft of the NVIDIA Docker plugin

Leverage the Docker volume plugin mechanism introduced with Docker 1.9. This plugin also exports a few REST endpoints to ease remote NVIDIA Docker management. This should address issue #8.
NVIDIA · Dec 6, 2015 · 9c679c7 · 9c679c7
1 parent b676d0a
commit 9c679c7
Show file tree

Hide file tree

Showing 16 changed files with 1,681 additions and 0 deletions.
diff --git a/plugin/Dockerfile.build b/plugin/Dockerfile.build
@@ -0,0 +1,34 @@
+# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+
+# Build image for the plugin: a Go toolchain plus the CUDA runtime and GDK
+# (NVML) headers and libraries referenced by CGO_CFLAGS/CGO_LDFLAGS below.
+FROM golang
+
+# Expected SHA-256 of the exported CUDA repository signing key (verified below).
+ENV NVIDIA_GPGKEY_SUM bd841d59a27a406e513db7d405550894188a4c1cd96bf8aa4f82f1b39e0b5c1c
+# Fingerprint of the key that is exported from the apt keyring for that check.
+ENV NVIDIA_GPGKEY_FPR 889bee522da690103c4b085ed88c3d385c37d3be
+# Expected SHA-256 of the GDK installer downloaded below.
+ENV NVIDIA_GDK_SUM 1e32e58f69fe29ee67b845233e7aa9347f37994463252bccbc8bfc8a7104ab5a
+
+# Fetch the CUDA repository key, fail the build if its checksum does not match
+# the pinned value, then register the CUDA apt repository.
+RUN apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/GPGKEY && \
+    apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +2 > cudasign.pub && \
+    echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
+    echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64 /" > /etc/apt/sources.list.d/cuda.list
+
+# Install the pinned CUDA 6.5 runtime development packages and drop the apt
+# package lists afterwards.
+RUN apt-get update && apt-get install -y --no-install-recommends --force-yes \
+    cuda-cudart-dev-6-5=6.5-19 cuda-misc-headers-6-5=6.5-19 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Rebind memcpy in the static CUDA runtime to the GLIBC_2.2.5 symbol version.
+# NOTE(review): presumably so the linked binary does not require a newer
+# glibc at run time -- confirm.
+RUN objcopy --redefine-sym memcpy=memcpy@GLIBC_2.2.5 /usr/local/cuda-6.5/lib64/libcudart_static.a
+
+# Download the GDK installer, verify it against NVIDIA_GDK_SUM, run it
+# silently and remove it.
+RUN wget -O gdk.run -q http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda_352_39_gdk_linux.run && \
+    echo "$NVIDIA_GDK_SUM gdk.run" | sha256sum -c --strict - && \
+    chmod +x gdk.run && ./gdk.run --silent && rm gdk.run
+
+# Plugin sources are copied into /go/src; /go/bin is declared as a volume.
+COPY src /go/src
+VOLUME /go/bin
+
+# Header and library search paths handed to cgo for the CUDA runtime and NVML.
+ENV CGO_CFLAGS "-I /usr/local/cuda-6.5/include -I /usr/include/nvidia/gdk"
+ENV CGO_LDFLAGS "-L /usr/local/cuda-6.5/lib64 -L /usr/src/gdk/nvml/lib -ldl -lrt"
+
+# UID must be supplied with --build-arg; the build command runs as a user
+# created with that uid.
+# NOTE(review): presumably so files written to the /go/bin volume are owned by
+# the invoking host user -- confirm.
+ARG UID
+RUN useradd --uid $UID build
+USER build
+
+# Build and install the "plugin" package, passing -s to the linker.
+CMD go get -v -ldflags="-s" plugin
diff --git a/plugin/Makefile b/plugin/Makefile
@@ -0,0 +1,26 @@
+# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+
+# MAKE_DIR: absolute directory of this Makefile (last entry of MAKEFILE_LIST).
+# BIN_DIR:  host directory that receives the built binary.
+# USER_ID:  numeric uid of the invoking user, passed to the image build as UID.
+MAKE_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
+BIN_DIR  := $(MAKE_DIR)/bin
+USER_ID  := $(shell id -u)
+
+# IMAGE:  tag given to the build image.
+# PREFIX: directory the binary is installed into.
+# TARGET: file name of the installed binary.
+# PLUGIN: path of the binary produced by the build container.
+IMAGE  := nvdocker-build
+PREFIX := /usr/local/nvidia
+TARGET := nvidia-docker-plugin
+PLUGIN := $(BIN_DIR)/plugin
+
+.PHONY: all install clean
+
+all : $(PLUGIN)
+
+# Build the image from Dockerfile.build, then run it with BIN_DIR mounted at
+# /go/bin so the resulting binary lands on the host.
+# NOTE: this rule has no prerequisites, so once $(PLUGIN) exists it is not
+# rebuilt when sources change; run `make clean` first to force a rebuild.
+$(PLUGIN) :
+	@docker build --build-arg UID=$(USER_ID) -t $(IMAGE) -f Dockerfile.build $(MAKE_DIR)
+	@mkdir -p $(BIN_DIR)
+	@docker run --rm -v $(BIN_DIR):/go/bin $(IMAGE)
+
+# Install the built binary as $(PREFIX)/$(TARGET) with mode 755.
+install: all
+	install -D -T -m 755 $(PLUGIN) $(PREFIX)/$(TARGET)
+
+# Remove the build image and the golang base image (failures are ignored and
+# stderr is discarded), then delete BIN_DIR.
+clean :
+	-@docker rmi -f $(IMAGE) golang 2> /dev/null
+	@rm -rf $(BIN_DIR)
diff --git a/plugin/src/cuda/cuda.go b/plugin/src/cuda/cuda.go
@@ -0,0 +1,118 @@
+// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+
+package cuda
+
+// #cgo LDFLAGS: -lcudart_static
+// #include <stdlib.h>
+// #include <cuda_runtime_api.h>
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"unsafe"
+)
+
+// MemoryInfo describes the memory characteristics of a CUDA device.
+// The units noted on each field follow the conversions done in NewDevice.
+type MemoryInfo struct {
+	ECC       bool // true when the device reports ECC as enabled
+	Global    uint // total global memory, in MiB
+	Shared    uint // shared memory per multiprocessor, in KiB; includes L1 cache
+	Constant  uint // total constant memory, in KiB
+	L2Cache   uint // L2 cache size, in KiB
+	Bandwidth uint // derived from memory clock rate and bus width; presumably GB/s -- TODO confirm
+}
+
+// Device holds the properties of a single CUDA device as collected by
+// NewDevice.
+type Device struct {
+	handle C.int // CUDA device number obtained from cudaDeviceGetByPCIBusId
+
+	Gen    string     // generation name looked up in archToGen (e.g. "Kepler")
+	Arch   string     // compute capability formatted as "major.minor"
+	Cores  uint       // cores per multiprocessor times the multiprocessor count
+	Memory MemoryInfo // memory characteristics of the device
+}
+
+// cudaErr converts a CUDA runtime status code into a Go error.
+// It yields nil for cudaSuccess; any other code becomes an error whose text
+// is the message reported by cudaGetErrorString.
+func cudaErr(ret C.cudaError_t) error {
+	if ret != C.cudaSuccess {
+		return errors.New(C.GoString(C.cudaGetErrorString(ret)))
+	}
+	return nil
+}
+
+// archToGen maps the major compute-capability digit (the first character of
+// the "major.minor" arch string) to the GPU generation name.
+var archToGen = map[string]string{
+	"1": "Tesla",
+	"2": "Fermi",
+	"3": "Kepler",
+	"5": "Maxwell",
+}
+
+// archToCoresPerSM maps a "major.minor" compute capability to the number of
+// cores per streaming multiprocessor. NewDevice rejects any architecture that
+// is missing from this table.
+var archToCoresPerSM = map[string]uint{
+	"1.0": 8,   // Tesla Generation (SM 1.0) G80 class
+	"1.1": 8,   // Tesla Generation (SM 1.1) G8x G9x class
+	"1.2": 8,   // Tesla Generation (SM 1.2) GT21x class
+	"1.3": 8,   // Tesla Generation (SM 1.3) GT20x class
+	"2.0": 32,  // Fermi Generation (SM 2.0) GF100 GF110 class
+	"2.1": 48,  // Fermi Generation (SM 2.1) GF10x GF11x class
+	"3.0": 192, // Kepler Generation (SM 3.0) GK10x class
+	"3.2": 192, // Kepler Generation (SM 3.2) TK1 class
+	"3.5": 192, // Kepler Generation (SM 3.5) GK11x GK20x class
+	"3.7": 192, // Kepler Generation (SM 3.7) GK21x class
+	"5.0": 128, // Maxwell Generation (SM 5.0) GM10x class
+	"5.2": 128, // Maxwell Generation (SM 5.2) GM20x class
+	"5.3": 128, // Maxwell Generation (SM 5.3) TX1 class
+}
+
+// GetDriverVersion returns the CUDA driver version formatted as
+// "major.minor".
+//
+// If cudaDriverGetVersion fails, the empty string is returned together with
+// the error, rather than a formatted placeholder version.
+func GetDriverVersion() (string, error) {
+	var driver C.int
+
+	if err := cudaErr(C.cudaDriverGetVersion(&driver)); err != nil {
+		return "", err
+	}
+	// CUDA encodes the version as 1000*major + 10*minor.
+	v := int(driver)
+	return fmt.Sprintf("%d.%d", v/1000, v%1000/10), nil
+}
+
+// NewDevice looks up the CUDA device with the given PCI bus ID and returns a
+// Device populated from its CUDA device properties.
+//
+// An error is returned when the bus ID cannot be resolved, when the device
+// properties cannot be queried, or when the device's compute capability is
+// not listed in archToCoresPerSM.
+func NewDevice(busID string) (*Device, error) {
+	var (
+		dev  C.int
+		prop C.struct_cudaDeviceProp
+	)
+
+	// C.CString allocates on the C heap. Deferring the free releases the
+	// buffer on every return path, including the early error returns.
+	id := C.CString(busID)
+	defer C.free(unsafe.Pointer(id))
+
+	if err := cudaErr(C.cudaDeviceGetByPCIBusId(&dev, id)); err != nil {
+		return nil, err
+	}
+	if err := cudaErr(C.cudaGetDeviceProperties(&prop, dev)); err != nil {
+		return nil, err
+	}
+	arch := fmt.Sprintf("%d.%d", prop.major, prop.minor)
+	cores, ok := archToCoresPerSM[arch]
+	if !ok {
+		return nil, fmt.Errorf("unsupported CUDA arch: %s", arch)
+	}
+
+	// Destroy the active CUDA context. This is best effort: a failure here
+	// does not invalidate the properties already read, so it is ignored.
+	cudaErr(C.cudaDeviceReset())
+
+	return &Device{
+		handle: dev,
+		// The generation is keyed by the major digit of the arch string.
+		Gen:   archToGen[arch[:1]],
+		Arch:  arch,
+		Cores: cores * uint(prop.multiProcessorCount),
+		Memory: MemoryInfo{
+			ECC:       prop.ECCEnabled != 0,
+			Global:    uint(prop.totalGlobalMem / (1024 * 1024)),
+			Shared:    uint(prop.sharedMemPerMultiprocessor / 1024),
+			Constant:  uint(prop.totalConstMem / 1024),
+			L2Cache:   uint(prop.l2CacheSize / 1024),
+			Bandwidth: 2 * uint((prop.memoryClockRate/1000)*(prop.memoryBusWidth/8)) / 1000,
+		},
+	}, nil
+}
+
+// CanAccessPeer reports whether dev1 can directly access the memory of dev2,
+// as answered by cudaDeviceCanAccessPeer. The error, if any, is the CUDA
+// error from that query.
+func CanAccessPeer(dev1, dev2 *Device) (bool, error) {
+	var access C.int
+
+	ret := C.cudaDeviceCanAccessPeer(&access, dev1.handle, dev2.handle)
+	return access != 0, cudaErr(ret)
+}
diff --git a/plugin/src/graceful/graceful.go b/plugin/src/graceful/graceful.go
@@ -0,0 +1,100 @@
+// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+
+package graceful
+
+import (
+	"net"
+	"net/http"
+	"sync"
+	"time"
+
+	middleware "github.com/justinas/alice"
+	"gopkg.in/tylerb/graceful.v1"
+)
+
+// timeout is used for the HTTP read and write timeouts as well as for the
+// graceful shutdown period of the server.
+const timeout = 5 * time.Second
+
+// HTTPServer bundles a request router with a gracefully stoppable HTTP
+// server. The embedded mutex guards err.
+type HTTPServer struct {
+	sync.Mutex
+
+	network string           // network passed to net.Listen
+	router  *http.ServeMux   // routes registered through Handle
+	server  *graceful.Server // underlying graceful server
+	err     error            // error recorded by Serve, read through Error
+}
+
+// recovery is a middleware that wraps handler so that a panic raised while
+// serving a request is caught and answered with a 500 Internal Server Error
+// status instead of propagating further.
+func recovery(handler http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		defer func() {
+			if p := recover(); p != nil {
+				w.WriteHeader(http.StatusInternalServerError)
+			}
+		}()
+		handler.ServeHTTP(w, r)
+	})
+}
+
+// NewHTTPServer creates a server for the given network and address.
+//
+// Requests pass through the recovery middleware first, then through mw in the
+// order given, and finally reach the server's router. Read, write and
+// graceful-shutdown timeouts are all set to the package-level timeout.
+func NewHTTPServer(net, addr string, mw ...middleware.Constructor) *HTTPServer {
+	mux := http.NewServeMux()
+	chain := middleware.New(recovery).Append(mw...)
+
+	httpSrv := &http.Server{
+		Addr:         addr,
+		Handler:      chain.Then(mux),
+		ReadTimeout:  timeout,
+		WriteTimeout: timeout,
+	}
+	return &HTTPServer{
+		network: net,
+		router:  mux,
+		server: &graceful.Server{
+			Timeout: timeout,
+			Server:  httpSrv,
+		},
+	}
+}
+
+// Handle registers handler for route, restricted to the given HTTP method.
+// Requests to route that use any other method are answered with 404 Not Found.
+func (s *HTTPServer) Handle(method, route string, handler http.HandlerFunc) {
+	s.router.HandleFunc(route, func(w http.ResponseWriter, r *http.Request) {
+		if r.Method == method {
+			handler.ServeHTTP(w, r)
+			return
+		}
+		http.NotFound(w, r)
+	})
+}
+
+// Serve starts listening on the configured network and address and serves
+// requests in a background goroutine. It returns the server's stop channel.
+//
+// If the listener cannot be created, the error is recorded and an already
+// closed channel is returned. Otherwise the serving goroutine holds the
+// server's mutex for as long as the underlying Serve call runs, so Error
+// blocks until serving has ended. The result of that call is recorded unless
+// it is a *net.OpError whose Op is "accept".
+func (s *HTTPServer) Serve() <-chan struct{} {
+	listener, err := net.Listen(s.network, s.server.Addr)
+	if err != nil {
+		s.Lock()
+		s.err = err
+		s.Unlock()
+		done := make(chan struct{})
+		close(done)
+		return done
+	}
+
+	stopped := s.server.StopChan()
+	go func() {
+		s.Lock()
+		defer s.Unlock()
+
+		serveErr := s.server.Serve(listener)
+		if opErr, isOp := serveErr.(*net.OpError); isOp && opErr.Op == "accept" {
+			// Accept errors are deliberately not recorded.
+			return
+		}
+		s.err = serveErr
+	}()
+	return stopped
+}
+
+// Stop asks the underlying graceful server to stop, passing it the
+// package-level timeout.
+func (s *HTTPServer) Stop() {
+	s.server.Stop(timeout)
+}
+
+// Error returns the error recorded by Serve, or nil if none was recorded.
+// It acquires the server's mutex to read the value.
+func (s *HTTPServer) Error() error {
+	s.Lock()
+	recorded := s.err
+	s.Unlock()
+
+	return recorded
+}