Skip to content
This repository has been archived by the owner on Jan 22, 2024. It is now read-only.

Commit

Permalink
First draft of the NVIDIA Docker plugin
Browse files Browse the repository at this point in the history
Leverage the Docker volume plugin mechanism introduced with Docker 1.9
This plugin also exports a few REST endpoints to ease remote NVIDIA Docker management
This should address issue #8
  • Loading branch information
3XX0 committed Dec 6, 2015
1 parent b676d0a commit 9c679c7
Show file tree
Hide file tree
Showing 16 changed files with 1,681 additions and 0 deletions.
34 changes: 34 additions & 0 deletions plugin/Dockerfile.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.

# Build image for the plugin: a Go toolchain plus the CUDA runtime headers,
# the static CUDA runtime library and the GDK (NVML) needed by cgo.
FROM golang

# Expected SHA-256 of the exported CUDA repository signing key.
ENV NVIDIA_GPGKEY_SUM bd841d59a27a406e513db7d405550894188a4c1cd96bf8aa4f82f1b39e0b5c1c
# Fingerprint of the key to export from the apt keyring for verification.
ENV NVIDIA_GPGKEY_FPR 889bee522da690103c4b085ed88c3d385c37d3be
# Expected SHA-256 of the GDK installer downloaded below.
ENV NVIDIA_GDK_SUM 1e32e58f69fe29ee67b845233e7aa9347f37994463252bccbc8bfc8a7104ab5a

# Fetch the CUDA repository key, export it and check it against the pinned
# checksum (the build stops if it does not match), then register the CUDA
# apt repository.
RUN apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/GPGKEY && \
    apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +2 > cudasign.pub && \
    echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
    echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64 /" > /etc/apt/sources.list.d/cuda.list

# Install pinned versions of the CUDA 6.5 runtime development files and
# headers, then drop the apt package lists.
RUN apt-get update && apt-get install -y --no-install-recommends --force-yes \
    cuda-cudart-dev-6-5=6.5-19 cuda-misc-headers-6-5=6.5-19 \
    && rm -rf /var/lib/apt/lists/*

# Rebind the memcpy symbol in the static CUDA runtime to the versioned
# memcpy@GLIBC_2.2.5.
# NOTE(review): presumably done so the resulting binary runs against older
# glibc versions -- confirm.
RUN objcopy --redefine-sym memcpy=memcpy@GLIBC_2.2.5 /usr/local/cuda-6.5/lib64/libcudart_static.a

# Download the GDK installer, verify it against the pinned checksum, run it
# non-interactively and remove it.
RUN wget -O gdk.run -q http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda_352_39_gdk_linux.run && \
    echo "$NVIDIA_GDK_SUM gdk.run" | sha256sum -c --strict - && \
    chmod +x gdk.run && ./gdk.run --silent && rm gdk.run

# Plugin sources go under /go/src; /go/bin is declared as a volume and is
# where the Makefile bind-mounts its bin directory to collect the binary.
COPY src /go/src
VOLUME /go/bin

# Header and library search paths handed to cgo for the CUDA runtime and the GDK.
ENV CGO_CFLAGS "-I /usr/local/cuda-6.5/include -I /usr/include/nvidia/gdk"
ENV CGO_LDFLAGS "-L /usr/local/cuda-6.5/lib64 -L /usr/src/gdk/nvml/lib -ldl -lrt"

# Run the build as a user whose uid is supplied at image build time
# (the Makefile passes the invoking user's uid via --build-arg UID=...).
# NOTE(review): no default is given, so building without --build-arg UID
# leaves $UID empty for useradd -- confirm this is acceptable.
ARG UID
RUN useradd --uid $UID build
USER build

# Build and install the "plugin" package into /go/bin; -ldflags="-s" is
# passed to the Go linker.
CMD go get -v -ldflags="-s" plugin
26 changes: 26 additions & 0 deletions plugin/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.

# Directory containing this Makefile; used as the Docker build context.
MAKE_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
# Host directory that receives the built binary.
BIN_DIR  := $(MAKE_DIR)/bin
# uid of the invoking user, forwarded to the build image as the UID build arg.
USER_ID  := $(shell id -u)

# Name of the Docker image used to build the plugin.
IMAGE  := nvdocker-build
# Installation directory and installed binary name.
PREFIX := /usr/local/nvidia
TARGET := nvidia-docker-plugin
# Path of the binary produced by the build container.
PLUGIN := $(BIN_DIR)/plugin

.PHONY: all install clean

all : $(PLUGIN)

# Build the builder image, then run it with BIN_DIR mounted at /go/bin so the
# resulting binary is written to the host.
$(PLUGIN) :
	@docker build --build-arg UID=$(USER_ID) -t $(IMAGE) -f Dockerfile.build $(MAKE_DIR)
	@mkdir -p $(BIN_DIR)
	@docker run --rm -v $(BIN_DIR):/go/bin $(IMAGE)

# Copy the built binary to $(PREFIX)/$(TARGET) with mode 755.
install: all
	install -D -T -m 755 $(PLUGIN) $(PREFIX)/$(TARGET)

# Remove the build images and the output directory. The leading "-" makes
# make ignore a failing docker rmi.
# NOTE(review): this also force-removes the shared "golang" base image, which
# other projects on the same host may rely on -- confirm this is intended.
clean :
	-@docker rmi -f $(IMAGE) golang 2> /dev/null
	@rm -rf $(BIN_DIR)
118 changes: 118 additions & 0 deletions plugin/src/cuda/cuda.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.

package cuda

// #cgo LDFLAGS: -lcudart_static
// #include <stdlib.h>
// #include <cuda_runtime_api.h>
import "C"

import (
"errors"
"fmt"
"unsafe"
)

// MemoryInfo describes the memory subsystem of a CUDA device. The units
// below reflect how NewDevice fills the fields in from cudaDeviceProp.
type MemoryInfo struct {
	// ECC reports whether the device property ECCEnabled is non-zero.
	ECC bool
	// Global is totalGlobalMem divided by 1024*1024.
	Global uint
	// Shared is sharedMemPerMultiprocessor divided by 1024.
	Shared uint // includes L1 cache
	// Constant is totalConstMem divided by 1024.
	Constant uint
	// L2Cache is l2CacheSize divided by 1024.
	L2Cache uint
	// Bandwidth is derived from memoryClockRate and memoryBusWidth.
	// NOTE(review): presumably a peak figure in GB/s, assuming the clock
	// rate is reported in kHz and the bus width in bits -- confirm.
	Bandwidth uint
}

// Device holds the static properties of a single CUDA device as collected
// by NewDevice.
type Device struct {
	// handle is the CUDA device number resolved from the PCI bus ID; it is
	// what CanAccessPeer passes to the CUDA runtime.
	handle C.int

	// Gen is the generation name looked up in archToGen from the first
	// character of Arch.
	Gen string
	// Arch is the compute capability formatted as "major.minor".
	Arch string
	// Cores is the per-multiprocessor core count from archToCoresPerSM
	// multiplied by the device's multiprocessor count.
	Cores uint
	// Memory describes the device memory subsystem.
	Memory MemoryInfo
}

// cudaErr translates a CUDA runtime status code into a Go error. It yields
// nil for cudaSuccess and otherwise an error carrying the message reported
// by cudaGetErrorString.
func cudaErr(ret C.cudaError_t) error {
	if ret != C.cudaSuccess {
		return errors.New(C.GoString(C.cudaGetErrorString(ret)))
	}
	return nil
}

// archToGen maps the leading (major) digit of a compute capability string
// to the name of the corresponding GPU generation. Majors that are absent
// from the table yield the empty string on lookup.
var archToGen = map[string]string{
	"1": "Tesla",
	"2": "Fermi",
	"3": "Kepler",
	"5": "Maxwell",
}

// archToCoresPerSM maps a "major.minor" compute capability to the number of
// cores in one streaming multiprocessor. NewDevice rejects any capability
// that is not listed here.
var archToCoresPerSM = map[string]uint{
	"1.0": 8,   // Tesla Generation (SM 1.0) G80 class
	"1.1": 8,   // Tesla Generation (SM 1.1) G8x G9x class
	"1.2": 8,   // Tesla Generation (SM 1.2) GT21x class
	"1.3": 8,   // Tesla Generation (SM 1.3) GT20x class
	"2.0": 32,  // Fermi Generation (SM 2.0) GF100 GF110 class
	"2.1": 48,  // Fermi Generation (SM 2.1) GF10x GF11x class
	"3.0": 192, // Kepler Generation (SM 3.0) GK10x class
	"3.2": 192, // Kepler Generation (SM 3.2) TK1 class
	"3.5": 192, // Kepler Generation (SM 3.5) GK11x GK20x class
	"3.7": 192, // Kepler Generation (SM 3.7) GK21x class
	"5.0": 128, // Maxwell Generation (SM 5.0) GM10x class
	"5.2": 128, // Maxwell Generation (SM 5.2) GM20x class
	"5.3": 128, // Maxwell Generation (SM 5.3) TX1 class
}

// GetDriverVersion returns the CUDA version supported by the installed
// driver, formatted as "major.minor".
//
// The runtime reports the version as a single integer encoded as
// 1000*major + 10*minor, which is decoded here. If the query fails, the
// returned string is empty and the error describes the CUDA failure.
func GetDriverVersion() (string, error) {
	var driver C.int

	if err := cudaErr(C.cudaDriverGetVersion(&driver)); err != nil {
		// Do not hand back a fabricated "0.0" together with an error.
		return "", err
	}
	v := int(driver)
	return fmt.Sprintf("%d.%d", v/1000, v%1000/10), nil
}

// NewDevice looks up the CUDA device attached at the given PCI bus ID and
// returns a Device describing its architecture, core count and memory.
//
// It returns an error if the bus ID cannot be resolved, if the device
// properties cannot be read, or if the device's compute capability is not
// listed in archToCoresPerSM.
func NewDevice(busID string) (*Device, error) {
	var (
		dev  C.int
		prop C.struct_cudaDeviceProp
	)

	id := C.CString(busID)
	// Deferred so the C copy of the bus ID is released on every path,
	// including the early return when the lookup fails.
	defer C.free(unsafe.Pointer(id))

	if err := cudaErr(C.cudaDeviceGetByPCIBusId(&dev, id)); err != nil {
		return nil, err
	}
	if err := cudaErr(C.cudaGetDeviceProperties(&prop, dev)); err != nil {
		return nil, err
	}

	// Destroy the active CUDA context. prop has already been copied out, so
	// this is done before validating the architecture to make sure the
	// context is also torn down when the device turns out to be unsupported.
	// The reset is best-effort: a failure here does not invalidate prop.
	_ = cudaErr(C.cudaDeviceReset())

	arch := fmt.Sprintf("%d.%d", prop.major, prop.minor)
	cores, ok := archToCoresPerSM[arch]
	if !ok {
		return nil, fmt.Errorf("unsupported CUDA arch: %s", arch)
	}

	return &Device{
		handle: dev,
		Gen:    archToGen[arch[:1]],
		Arch:   arch,
		Cores:  cores * uint(prop.multiProcessorCount),
		Memory: MemoryInfo{
			ECC:       bool(prop.ECCEnabled != 0),
			Global:    uint(prop.totalGlobalMem / (1024 * 1024)),
			Shared:    uint(prop.sharedMemPerMultiprocessor / 1024),
			Constant:  uint(prop.totalConstMem / 1024),
			L2Cache:   uint(prop.l2CacheSize / 1024),
			Bandwidth: 2 * uint((prop.memoryClockRate/1000)*(prop.memoryBusWidth/8)) / 1000,
		},
	}, nil
}

// CanAccessPeer asks the CUDA runtime whether dev1 can directly access the
// memory of dev2. The boolean is the flag reported by
// cudaDeviceCanAccessPeer; the error is non-nil if the query itself failed.
func CanAccessPeer(dev1, dev2 *Device) (bool, error) {
	var flag C.int

	ret := C.cudaDeviceCanAccessPeer(&flag, dev1.handle, dev2.handle)
	return flag != 0, cudaErr(ret)
}
100 changes: 100 additions & 0 deletions plugin/src/graceful/graceful.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.

package graceful

import (
"net"
"net/http"
"sync"
"time"

middleware "github.com/justinas/alice"
"gopkg.in/tylerb/graceful.v1"
)

// timeout is the single duration used for the HTTP read and write
// deadlines, for the graceful server's shutdown timeout, and for Stop.
const timeout = 5 * time.Second

// HTTPServer wraps a graceful HTTP server together with its request router
// and the last error recorded while listening or serving.
type HTTPServer struct {
	// The embedded mutex guards err. Serve's goroutine holds it for as long
	// as the server is serving, so Error blocks until serving has ended.
	sync.Mutex

	// network is the network name handed to net.Listen.
	network string
	// router dispatches incoming requests to the handlers added via Handle.
	router *http.ServeMux
	// server is the underlying graceful server.
	server *graceful.Server
	// err is the error recorded by Serve, if any.
	err error
}

// recovery is a middleware that shields the server from panicking handlers:
// when the wrapped handler panics, the panic is swallowed and the response
// status is set to 500 Internal Server Error.
func recovery(handler http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		defer func() {
			// recover must be called directly from the deferred function.
			if p := recover(); p != nil {
				w.WriteHeader(http.StatusInternalServerError)
			}
		}()
		handler.ServeHTTP(w, r)
	})
}

// NewHTTPServer builds an HTTPServer for the given network and address.
// Requests pass through the recovery middleware first, then through the
// supplied middleware constructors in order, and finally reach the router.
// The server does not listen until Serve is called.
func NewHTTPServer(net, addr string, mw ...middleware.Constructor) *HTTPServer {
	mux := http.NewServeMux()
	chain := middleware.New(recovery).Append(mw...)

	httpSrv := &http.Server{
		Addr:         addr,
		Handler:      chain.Then(mux),
		ReadTimeout:  timeout,
		WriteTimeout: timeout,
	}
	return &HTTPServer{
		network: net,
		router:  mux,
		server: &graceful.Server{
			Timeout: timeout,
			Server:  httpSrv,
		},
	}
}

// Handle registers handler on the router for the given route. The handler
// is only invoked for requests whose HTTP method equals method; any other
// method on that route is answered with 404 Not Found.
func (s *HTTPServer) Handle(method, route string, handler http.HandlerFunc) {
	s.router.HandleFunc(route, func(w http.ResponseWriter, r *http.Request) {
		if r.Method == method {
			handler.ServeHTTP(w, r)
			return
		}
		http.NotFound(w, r)
	})
}

// Serve starts listening on the configured network and address and serves
// requests in a background goroutine.
//
// If listening fails, the error is recorded and an already-closed channel is
// returned. Otherwise the returned channel is the graceful server's stop
// channel. The goroutine holds the server's mutex for as long as it is
// serving, so Error blocks until serving has ended. An error from serving is
// recorded unless it is a *net.OpError whose operation is "accept".
func (s *HTTPServer) Serve() <-chan struct{} {
	listener, err := net.Listen(s.network, s.server.Addr)
	if err != nil {
		s.Lock()
		s.err = err
		s.Unlock()

		done := make(chan struct{})
		close(done)
		return done
	}

	stopped := s.server.StopChan()
	go func() {
		s.Lock()
		defer s.Unlock()

		serveErr := s.server.Serve(listener)
		if opErr, isOp := serveErr.(*net.OpError); !isOp || opErr.Op != "accept" {
			s.err = serveErr
		}
	}()
	return stopped
}

// Stop asks the underlying graceful server to stop, passing the package
// timeout as the stop timeout.
func (s *HTTPServer) Stop() {
	s.server.Stop(timeout)
}

// Error returns the error recorded by Serve, or nil if there is none. It
// acquires the server's mutex, so it waits while the serving goroutine is
// still holding it.
func (s *HTTPServer) Error() error {
	s.Lock()
	err := s.err
	s.Unlock()

	return err
}
Loading

0 comments on commit 9c679c7

Please sign in to comment.