diff --git a/lib/ljsyscall/.dockerignore b/lib/ljsyscall/.dockerignore new file mode 100644 index 0000000000..fd1d8943f6 --- /dev/null +++ b/lib/ljsyscall/.dockerignore @@ -0,0 +1,12 @@ +.* +*.md +COPYRIGHT +ChangeLog +Dockerfile +INSTALL +doc +*.yml +examples +include +rockspec +test diff --git a/lib/ljsyscall/.gitignore b/lib/ljsyscall/.gitignore index ea3ef8819f..a6e4c5785e 100644 --- a/lib/ljsyscall/.gitignore +++ b/lib/ljsyscall/.gitignore @@ -4,3 +4,12 @@ tmp/* *.core ktrace.out obj/* + +/5.1-ljsyscall +/debian/debhelper-build-stamp +/debian/files +/debian/lua-ljsyscall* +/debian/lua_versions +/debian/tmp +/debian/trash + diff --git a/lib/ljsyscall/.travis.yml b/lib/ljsyscall/.travis.yml index 6c90dc6e7c..e474901663 100644 --- a/lib/ljsyscall/.travis.yml +++ b/lib/ljsyscall/.travis.yml @@ -1,11 +1,15 @@ language: c +sudo: required +dist: trusty + +addons: + apt: + packages: + - luajit + - luarocks + - strace before_install: - - sudo add-apt-repository ppa:mwild1/ppa -y - - sudo apt-get update -y - - sudo apt-get install luajit -y --force-yes - - sudo apt-get install luarocks -y - - sudo apt-get install strace -y - git submodule update --init --recursive env: diff --git a/lib/ljsyscall/COPYRIGHT b/lib/ljsyscall/COPYRIGHT index 9f187c3513..2f9256587c 100644 --- a/lib/ljsyscall/COPYRIGHT +++ b/lib/ljsyscall/COPYRIGHT @@ -6,7 +6,7 @@ Files under the include directory include their own copyright information. ljsyscall: System call interface for LuaJIT -Copyright (C) 2011-2014 Justin Cormack. All rights reserved. +Copyright (C) 2011-2016 Justin Cormack. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/lib/ljsyscall/ChangeLog b/lib/ljsyscall/ChangeLog index 0442771dfe..c56e549bb8 100644 --- a/lib/ljsyscall/ChangeLog +++ b/lib/ljsyscall/ChangeLog @@ -1,10 +1,19 @@ - unreleased + 0.12 release + ++ Fix seccomp on arm64 ++ Linux added support for eBPF ++ bug fixes + + 0.11 release + + OSX time functions + OSX Mach types ++ OSX fixes for Yosemite + arm64 support -+ OpenBSD 5.6 and 5.7 support ++ OpenBSD 5.6, 5.7 and 5.8 support + ppc64le support, by Gustavo Serra Scalet + mipsel support ++ added Dockerfile, now available on Docker Hub 0.10 release diff --git a/lib/ljsyscall/Dockerfile b/lib/ljsyscall/Dockerfile new file mode 100644 index 0000000000..50bfcdfd9b --- /dev/null +++ b/lib/ljsyscall/Dockerfile @@ -0,0 +1,4 @@ +FROM alpine:3.4 +RUN apk update && apk add luajit luajit-dev strace && mkdir -p /usr/share/lua/5.1 +COPY . /usr/share/lua/5.1/ +ENTRYPOINT ["luajit"] diff --git a/lib/ljsyscall/README.md b/lib/ljsyscall/README.md index a96203a40d..bcb016820e 100644 --- a/lib/ljsyscall/README.md +++ b/lib/ljsyscall/README.md @@ -16,13 +16,15 @@ The [video of my FOSDEM 2013 talk](http://www.myriabit.com/ljsyscall/) here, and ## Install +A Docker hub automated build (currently only for Linux) is available via `docker pull justincormack/ljsyscall`. You can run the test suite with `docker run justincormack/ljsyscall test/test.lua`, use in a scripted way eg `docker run justincormack/ljsyscall -e "print(require('syscall').nl.interfaces())"` or get an interactive session with `docker run -it justincormack/ljsyscall`. + The stable release is now available in the luarocks repository, so you should be able to run ```luarocks install ljsyscall```. There will be a ```ljsyscall-rump``` rock soon, but I need to fix the install for the rump libraries. 
For simple uses, you just need to put the ```.lua``` files somewhere that LuaJIT will find them, eg typically in ```/usr/local/share/lua/5.1/```. Keep the directory structure there is. You can safely remove files from architectures and operating systems you do not use. You can also install the head version using luarocks: ```luarocks install rockspec/ljsyscall-scm-1.rockspec``` . -It is also available as a package in [buildroot](http://buildroot.uclibc.org/), a build system for embedded systems, and in [pkgsrc](http://www.pkgsrc.org] the portable packaging system for many systems. +It is also available as a package in [buildroot](http://buildroot.uclibc.org/), a build system for embedded systems, and in [pkgsrc](http://www.pkgsrc.org), the portable packaging system for many systems. It is now packaged for [Alpine Linux](http://www.alpinelinux.org/), in the testing repository. If you are using Lua rather than LuaJIT you need to install [luaffi](/~https://github.com/jmckaskill/luaffi) first; this is largely working now, but there will be more support for standard Lua coming soon. @@ -66,6 +68,7 @@ This project is being used in a variety of places, such as for testing the Linux * [buildroot](http://buildroot.uclibc.org/) has an ljsyscall package. * [luatz](/~https://github.com/daurnimator/luatz) uses ljsyscall when available * [Snabb switch](/~https://github.com/SnabbCo/snabbswitch) a high performance networking toolkit. +* [Spook](/~https://github.com/johnae/spook) started out as an fs events based test runner similar to Ruby's guard but grew into an event toolkit of sorts. 
## Testing diff --git a/lib/ljsyscall/debian/changelog b/lib/ljsyscall/debian/changelog new file mode 100644 index 0000000000..f44a5b608a --- /dev/null +++ b/lib/ljsyscall/debian/changelog @@ -0,0 +1,5 @@ +lua-ljsyscall (0.12-1) unstable; urgency=medium + + * UNRELEASED + + -- John Doe Sun, 23 Jul 2017 19:43:15 +0200 diff --git a/lib/ljsyscall/debian/compat b/lib/ljsyscall/debian/compat new file mode 100644 index 0000000000..ec635144f6 --- /dev/null +++ b/lib/ljsyscall/debian/compat @@ -0,0 +1 @@ +9 diff --git a/lib/ljsyscall/debian/control b/lib/ljsyscall/debian/control new file mode 100644 index 0000000000..154d215980 --- /dev/null +++ b/lib/ljsyscall/debian/control @@ -0,0 +1,25 @@ +Source: lua-ljsyscall +Section: interpreters +Priority: optional +Maintainer: nobody +Build-Depends: debhelper (>= 9), dh-lua +Standards-Version: 4.0.0 +Homepage: http://www.myriabit.com/ljsyscall/ + +Package: lua-ljsyscall +Architecture: all +Pre-Depends: ${misc:Pre-Depends} +Depends: luajit, ${misc:Depends} +Provides: ${lua:Provides} +XB-Lua-Versions: ${lua:Versions} +Description: LuaJIT Linux syscall FFI + +Package: lua-ljsyscall-dev +Section: libdevel +Architecture: all +Pre-Depends: ${misc:Pre-Depends} +Depends: ${misc:Depends} +Provides: ${lua:Provides} +XB-Lua-Versions: ${lua:Versions} +Description: ljsyscall doc + This package contains the documentation of the ljsyscall library. diff --git a/lib/ljsyscall/debian/copyright b/lib/ljsyscall/debian/copyright new file mode 100644 index 0000000000..679922b416 --- /dev/null +++ b/lib/ljsyscall/debian/copyright @@ -0,0 +1,30 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: ljsyscall +Source: /~https://github.com/justincormack/ljsyscall + +Files: * +Copyright: Copyright (C) 2011-2016 Justin Cormack. All rights reserved. +License: Expat + +Files: */doc +Copyright: Copyright (C) 2011-2016 Justin Cormack. All rights reserved. 
+License: CC0 + +License: Expat + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
diff --git a/lib/ljsyscall/debian/dh-lua.conf b/lib/ljsyscall/debian/dh-lua.conf new file mode 100644 index 0000000000..19d2316714 --- /dev/null +++ b/lib/ljsyscall/debian/dh-lua.conf @@ -0,0 +1,24 @@ +### mandatory fields +LUA_VERSION=5.1 +PKG_NAME=ljsyscall + +### things relative to the C library part +CLIB_CFLAGS= +CLIB_LDFLAGS= +CLIB_LDFLAGS_STATIC= +CLIB_OBJS= +LUA_MODNAME_CPART= + +### things relative to the lua library part +LUA_HEADER= +LUA_SOURCES=syscall.lua syscall/*.lua syscall/shared/*.lua syscall/linux/*.lua syscall/linux/*/*.lua +LUA_SOURCES_MANGLER= +LUA_MODNAME=syscall +LUA_TEST= + +### this part is relative to pkg-config +PKG_VERSION= +PKG_LIBS_PRIVATE= +PKG_URL= +PKG_REQUIRES= +PKG_CONFLICTS= diff --git a/lib/ljsyscall/debian/lua-ljsyscall-dev.docs b/lib/ljsyscall/debian/lua-ljsyscall-dev.docs new file mode 100644 index 0000000000..ea60385cf8 --- /dev/null +++ b/lib/ljsyscall/debian/lua-ljsyscall-dev.docs @@ -0,0 +1,2 @@ +doc +test diff --git a/lib/ljsyscall/debian/patches/series b/lib/ljsyscall/debian/patches/series new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/ljsyscall/debian/rules b/lib/ljsyscall/debian/rules new file mode 100755 index 0000000000..4f36696ce5 --- /dev/null +++ b/lib/ljsyscall/debian/rules @@ -0,0 +1,4 @@ +#!/usr/bin/make -f + +%: + dh $@ --buildsystem=lua --with lua diff --git a/lib/ljsyscall/debian/source/format b/lib/ljsyscall/debian/source/format new file mode 100644 index 0000000000..163aaf8d82 --- /dev/null +++ b/lib/ljsyscall/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/lib/ljsyscall/debian/tests/control b/lib/ljsyscall/debian/tests/control new file mode 100644 index 0000000000..cdb0fa9909 --- /dev/null +++ b/lib/ljsyscall/debian/tests/control @@ -0,0 +1,3 @@ +Tests: dh-lua-tests +Restrictions: rw-build-tree +Depends: @, dh-lua diff --git a/lib/ljsyscall/debian/tests/dh-lua-tests b/lib/ljsyscall/debian/tests/dh-lua-tests new file mode 100644 index 0000000000..738a2eb7ce --- /dev/null 
+++ b/lib/ljsyscall/debian/tests/dh-lua-tests @@ -0,0 +1 @@ +debian/rules autopkgtest diff --git a/lib/ljsyscall/debian/watch b/lib/ljsyscall/debian/watch new file mode 100644 index 0000000000..39da9e737d --- /dev/null +++ b/lib/ljsyscall/debian/watch @@ -0,0 +1,6 @@ +# test this watch file using: +# uscan --watchfile debian/watch --upstream-version 0.1 --package lua-ljsyscall +# +version=3 +opts=filenamemangle=s/.+\/v?(\d\S*)\.tar\.gz/ljsyscall-$1\.tar\.gz/ \ + /~https://github.com/justincormack/ljsyscall/releases .*/v?(\d\S*)\.tar\.gz diff --git a/lib/ljsyscall/docker-compose.test.yml b/lib/ljsyscall/docker-compose.test.yml new file mode 100644 index 0000000000..300cd4b981 --- /dev/null +++ b/lib/ljsyscall/docker-compose.test.yml @@ -0,0 +1,5 @@ +sut: + build: . + command: /test/test.lua + volumes: + - ./test:/test diff --git a/lib/ljsyscall/rockspec/ljsyscall-0.11-1.rockspec b/lib/ljsyscall/rockspec/ljsyscall-0.11-1.rockspec new file mode 100644 index 0000000000..2b66396d20 --- /dev/null +++ b/lib/ljsyscall/rockspec/ljsyscall-0.11-1.rockspec @@ -0,0 +1,170 @@ +package = "ljsyscall" +version = "0.11-1" +source = +{ + url = "/~https://github.com/justincormack/ljsyscall/archive/v0.11.tar.gz"; + dir = "ljsyscall-0.11"; +} + +description = +{ + summary = "LuaJIT Linux syscall FFI"; + homepage = "http://www.myriabit.com/ljsyscall/"; + license = "MIT"; +} +dependencies = +{ + "lua == 5.1"; -- In fact this should be "luajit >= 2.0.0" +} +build = +{ + type = "builtin"; + modules = + { + ["syscall"] = "syscall.lua"; + ["syscall.abi"] = "syscall/abi.lua"; + ["syscall.helpers"] = "syscall/helpers.lua"; + ["syscall.syscalls"] = "syscall/syscalls.lua"; + ["syscall.libc"] = "syscall/libc.lua"; + ["syscall.methods"] = "syscall/methods.lua"; + ["syscall.ffitypes"] = "syscall/ffitypes.lua"; + ["syscall.util"] = "syscall/util.lua"; + ["syscall.compat"] = "syscall/compat.lua"; + ["syscall.bit"] = "syscall/bit.lua"; + ["syscall.types"] = "syscall/types.lua"; + ["syscall.lfs"] = 
"syscall/lfs.lua"; + + ["syscall.shared.types"] = "syscall/shared/types.lua"; + }; + platforms = + { + linux = + { + modules = { + ["syscall.linux.syscalls"] = "syscall/linux/syscalls.lua"; + ["syscall.linux.c"] = "syscall/linux/c.lua"; + ["syscall.linux.constants"] = "syscall/linux/constants.lua"; + ["syscall.linux.ffi"] = "syscall/linux/ffi.lua"; + ["syscall.linux.ioctl"] = "syscall/linux/ioctl.lua"; + ["syscall.linux.types"] = "syscall/linux/types.lua"; + ["syscall.linux.fcntl"] = "syscall/linux/fcntl.lua"; + ["syscall.linux.errors"] = "syscall/linux/errors.lua"; + ["syscall.linux.util"] = "syscall/linux/util.lua"; + ["syscall.linux.nr"] = "syscall/linux/nr.lua"; + + ["syscall.linux.nl"] = "syscall/linux/nl.lua"; + ["syscall.linux.netfilter"] = "syscall/linux/netfilter.lua"; + ["syscall.linux.sockopt"] = "syscall/linux/sockopt.lua"; + ["syscall.linux.cgroup"] = "syscall/linux/cgroup.lua"; + + ["syscall.linux.arm.constants"] = "syscall/linux/arm/constants.lua"; + ["syscall.linux.arm.ffi"] = "syscall/linux/arm/ffi.lua"; + ["syscall.linux.arm.ioctl"] = "syscall/linux/arm/ioctl.lua"; + ["syscall.linux.arm.nr"] = "syscall/linux/arm/nr.lua"; + ["syscall.linux.arm64.constants"] = "syscall/linux/arm64/constants.lua"; + ["syscall.linux.arm64.ffi"] = "syscall/linux/arm64/ffi.lua"; + ["syscall.linux.arm64.ioctl"] = "syscall/linux/arm64/ioctl.lua"; + ["syscall.linux.arm64.nr"] = "syscall/linux/arm64/nr.lua"; + ["syscall.linux.mips.constants"] = "syscall/linux/mips/constants.lua"; + ["syscall.linux.mips.ffi"] = "syscall/linux/mips/ffi.lua"; + ["syscall.linux.mips.ioctl"] = "syscall/linux/mips/ioctl.lua"; + ["syscall.linux.mips.nr"] = "syscall/linux/mips/nr.lua"; + ["syscall.linux.ppc.constants"] = "syscall/linux/ppc/constants.lua"; + ["syscall.linux.ppc.ffi"] = "syscall/linux/ppc/ffi.lua"; + ["syscall.linux.ppc.ioctl"] = "syscall/linux/ppc/ioctl.lua"; + ["syscall.linux.ppc.nr"] = "syscall/linux/ppc/nr.lua"; + ["syscall.linux.ppc64le.constants"] = 
"syscall/linux/ppc64le/constants.lua"; + ["syscall.linux.ppc64le.ffi"] = "syscall/linux/ppc64le/ffi.lua"; + ["syscall.linux.ppc64le.ioctl"] = "syscall/linux/ppc64le/ioctl.lua"; + ["syscall.linux.ppc64le.nr"] = "syscall/linux/ppc64le/nr.lua"; + ["syscall.linux.x64.constants"] = "syscall/linux/x64/constants.lua"; + ["syscall.linux.x64.ffi"] = "syscall/linux/x64/ffi.lua"; + ["syscall.linux.x64.ioctl"] = "syscall/linux/x64/ioctl.lua"; + ["syscall.linux.x64.nr"] = "syscall/linux/x64/nr.lua"; + ["syscall.linux.x86.constants"] = "syscall/linux/x86/constants.lua"; + ["syscall.linux.x86.ffi"] = "syscall/linux/x86/ffi.lua"; + ["syscall.linux.x86.ioctl"] = "syscall/linux/x86/ioctl.lua"; + ["syscall.linux.x86.nr"] = "syscall/linux/x86/nr.lua"; + } + }; + macosx = + { + modules = + { + ["syscall.osx.syscalls"] = "syscall/osx/syscalls.lua"; + ["syscall.osx.c"] = "syscall/osx/c.lua"; + ["syscall.osx.constants"] = "syscall/osx/constants.lua"; + ["syscall.osx.ffi"] = "syscall/osx/ffi.lua"; + ["syscall.osx.ioctl"] = "syscall/osx/ioctl.lua"; + ["syscall.osx.types"] = "syscall/osx/types.lua"; + ["syscall.osx.fcntl"] = "syscall/osx/fcntl.lua"; + ["syscall.osx.errors"] = "syscall/osx/errors.lua"; + ["syscall.osx.util"] = "syscall/osx/util.lua"; + ["syscall.osx.sysctl"] = "syscall/osx/sysctl.lua"; + } + }; + freebsd = + { + modules = + { + ["syscall.freebsd.syscalls"] = "syscall/freebsd/syscalls.lua"; + ["syscall.freebsd.c"] = "syscall/freebsd/c.lua"; + ["syscall.freebsd.constants"] = "syscall/freebsd/constants.lua"; + ["syscall.freebsd.ffi"] = "syscall/freebsd/ffi.lua"; + ["syscall.freebsd.ioctl"] = "syscall/freebsd/ioctl.lua"; + ["syscall.freebsd.types"] = "syscall/freebsd/types.lua"; + ["syscall.freebsd.fcntl"] = "syscall/freebsd/fcntl.lua"; + ["syscall.freebsd.errors"] = "syscall/freebsd/errors.lua"; + ["syscall.freebsd.util"] = "syscall/freebsd/util.lua"; + ["syscall.freebsd.version"] = "syscall/freebsd/version.lua"; + ["syscall.freebsd.sysctl"] = "syscall/freebsd/sysctl.lua"; + } + 
}; + netbsd = + { + modules = + { + ["syscall.netbsd.syscalls"] = "syscall/netbsd/syscalls.lua"; + ["syscall.netbsd.c"] = "syscall/netbsd/c.lua"; + ["syscall.netbsd.constants"] = "syscall/netbsd/constants.lua"; + ["syscall.netbsd.ffitypes"] = "syscall/netbsd/ffitypes.lua"; + ["syscall.netbsd.ffifunctions"] = "syscall/netbsd/ffifunctions.lua"; + ["syscall.netbsd.ioctl"] = "syscall/netbsd/ioctl.lua"; + ["syscall.netbsd.types"] = "syscall/netbsd/types.lua"; + ["syscall.netbsd.fcntl"] = "syscall/netbsd/fcntl.lua"; + ["syscall.netbsd.errors"] = "syscall/netbsd/errors.lua"; + ["syscall.netbsd.util"] = "syscall/netbsd/util.lua"; + ["syscall.netbsd.nr"] = "syscall/netbsd/nr.lua"; + ["syscall.netbsd.init"] = "syscall/netbsd/init.lua"; + ["syscall.netbsd.version"] = "syscall/netbsd/version.lua"; + ["syscall.netbsd.sysctl"] = "syscall/netbsd/sysctl.lua"; + } + }; + openbsd = + { + modules = + { + ["syscall.openbsd.syscalls"] = "syscall/openbsd/syscalls.lua"; + ["syscall.openbsd.c"] = "syscall/openbsd/c.lua"; + ["syscall.openbsd.constants"] = "syscall/openbsd/constants.lua"; + ["syscall.openbsd.ffi"] = "syscall/openbsd/ffi.lua"; + ["syscall.openbsd.ioctl"] = "syscall/openbsd/ioctl.lua"; + ["syscall.openbsd.types"] = "syscall/openbsd/types.lua"; + ["syscall.openbsd.fcntl"] = "syscall/openbsd/fcntl.lua"; + ["syscall.openbsd.errors"] = "syscall/openbsd/errors.lua"; + ["syscall.openbsd.util"] = "syscall/openbsd/util.lua"; + ["syscall.openbsd.version"] = "syscall/openbsd/version.lua"; + ["syscall.openbsd.sysctl"] = "syscall/openbsd/sysctl.lua"; + } + }; + bsd = + { + modules = + { + ["syscall.bsd.syscalls"] = "syscall/bsd/syscalls.lua"; + ["syscall.bsd.ffi"] = "syscall/bsd/ffi.lua"; + ["syscall.bsd.types"] = "syscall/bsd/types.lua"; + } + }; + } +} diff --git a/lib/ljsyscall/rockspec/ljsyscall-0.12-1.rockspec b/lib/ljsyscall/rockspec/ljsyscall-0.12-1.rockspec new file mode 100644 index 0000000000..f614e71605 --- /dev/null +++ b/lib/ljsyscall/rockspec/ljsyscall-0.12-1.rockspec @@ 
-0,0 +1,170 @@ +package = "ljsyscall" +version = "0.12-1" +source = +{ + url = "/~https://github.com/justincormack/ljsyscall/archive/v0.12.tar.gz"; + dir = "ljsyscall-0.12"; +} + +description = +{ + summary = "LuaJIT Linux syscall FFI"; + homepage = "http://www.myriabit.com/ljsyscall/"; + license = "MIT"; +} +dependencies = +{ + "lua == 5.1"; -- In fact this should be "luajit >= 2.0.0" +} +build = +{ + type = "builtin"; + modules = + { + ["syscall"] = "syscall.lua"; + ["syscall.abi"] = "syscall/abi.lua"; + ["syscall.helpers"] = "syscall/helpers.lua"; + ["syscall.syscalls"] = "syscall/syscalls.lua"; + ["syscall.libc"] = "syscall/libc.lua"; + ["syscall.methods"] = "syscall/methods.lua"; + ["syscall.ffitypes"] = "syscall/ffitypes.lua"; + ["syscall.util"] = "syscall/util.lua"; + ["syscall.compat"] = "syscall/compat.lua"; + ["syscall.bit"] = "syscall/bit.lua"; + ["syscall.types"] = "syscall/types.lua"; + ["syscall.lfs"] = "syscall/lfs.lua"; + + ["syscall.shared.types"] = "syscall/shared/types.lua"; + }; + platforms = + { + linux = + { + modules = { + ["syscall.linux.syscalls"] = "syscall/linux/syscalls.lua"; + ["syscall.linux.c"] = "syscall/linux/c.lua"; + ["syscall.linux.constants"] = "syscall/linux/constants.lua"; + ["syscall.linux.ffi"] = "syscall/linux/ffi.lua"; + ["syscall.linux.ioctl"] = "syscall/linux/ioctl.lua"; + ["syscall.linux.types"] = "syscall/linux/types.lua"; + ["syscall.linux.fcntl"] = "syscall/linux/fcntl.lua"; + ["syscall.linux.errors"] = "syscall/linux/errors.lua"; + ["syscall.linux.util"] = "syscall/linux/util.lua"; + ["syscall.linux.nr"] = "syscall/linux/nr.lua"; + + ["syscall.linux.nl"] = "syscall/linux/nl.lua"; + ["syscall.linux.netfilter"] = "syscall/linux/netfilter.lua"; + ["syscall.linux.sockopt"] = "syscall/linux/sockopt.lua"; + ["syscall.linux.cgroup"] = "syscall/linux/cgroup.lua"; + + ["syscall.linux.arm.constants"] = "syscall/linux/arm/constants.lua"; + ["syscall.linux.arm.ffi"] = "syscall/linux/arm/ffi.lua"; + ["syscall.linux.arm.ioctl"] = 
"syscall/linux/arm/ioctl.lua"; + ["syscall.linux.arm.nr"] = "syscall/linux/arm/nr.lua"; + ["syscall.linux.arm64.constants"] = "syscall/linux/arm64/constants.lua"; + ["syscall.linux.arm64.ffi"] = "syscall/linux/arm64/ffi.lua"; + ["syscall.linux.arm64.ioctl"] = "syscall/linux/arm64/ioctl.lua"; + ["syscall.linux.arm64.nr"] = "syscall/linux/arm64/nr.lua"; + ["syscall.linux.mips.constants"] = "syscall/linux/mips/constants.lua"; + ["syscall.linux.mips.ffi"] = "syscall/linux/mips/ffi.lua"; + ["syscall.linux.mips.ioctl"] = "syscall/linux/mips/ioctl.lua"; + ["syscall.linux.mips.nr"] = "syscall/linux/mips/nr.lua"; + ["syscall.linux.ppc.constants"] = "syscall/linux/ppc/constants.lua"; + ["syscall.linux.ppc.ffi"] = "syscall/linux/ppc/ffi.lua"; + ["syscall.linux.ppc.ioctl"] = "syscall/linux/ppc/ioctl.lua"; + ["syscall.linux.ppc.nr"] = "syscall/linux/ppc/nr.lua"; + ["syscall.linux.ppc64le.constants"] = "syscall/linux/ppc64le/constants.lua"; + ["syscall.linux.ppc64le.ffi"] = "syscall/linux/ppc64le/ffi.lua"; + ["syscall.linux.ppc64le.ioctl"] = "syscall/linux/ppc64le/ioctl.lua"; + ["syscall.linux.ppc64le.nr"] = "syscall/linux/ppc64le/nr.lua"; + ["syscall.linux.x64.constants"] = "syscall/linux/x64/constants.lua"; + ["syscall.linux.x64.ffi"] = "syscall/linux/x64/ffi.lua"; + ["syscall.linux.x64.ioctl"] = "syscall/linux/x64/ioctl.lua"; + ["syscall.linux.x64.nr"] = "syscall/linux/x64/nr.lua"; + ["syscall.linux.x86.constants"] = "syscall/linux/x86/constants.lua"; + ["syscall.linux.x86.ffi"] = "syscall/linux/x86/ffi.lua"; + ["syscall.linux.x86.ioctl"] = "syscall/linux/x86/ioctl.lua"; + ["syscall.linux.x86.nr"] = "syscall/linux/x86/nr.lua"; + } + }; + macosx = + { + modules = + { + ["syscall.osx.syscalls"] = "syscall/osx/syscalls.lua"; + ["syscall.osx.c"] = "syscall/osx/c.lua"; + ["syscall.osx.constants"] = "syscall/osx/constants.lua"; + ["syscall.osx.ffi"] = "syscall/osx/ffi.lua"; + ["syscall.osx.ioctl"] = "syscall/osx/ioctl.lua"; + ["syscall.osx.types"] = "syscall/osx/types.lua"; + 
["syscall.osx.fcntl"] = "syscall/osx/fcntl.lua"; + ["syscall.osx.errors"] = "syscall/osx/errors.lua"; + ["syscall.osx.util"] = "syscall/osx/util.lua"; + ["syscall.osx.sysctl"] = "syscall/osx/sysctl.lua"; + } + }; + freebsd = + { + modules = + { + ["syscall.freebsd.syscalls"] = "syscall/freebsd/syscalls.lua"; + ["syscall.freebsd.c"] = "syscall/freebsd/c.lua"; + ["syscall.freebsd.constants"] = "syscall/freebsd/constants.lua"; + ["syscall.freebsd.ffi"] = "syscall/freebsd/ffi.lua"; + ["syscall.freebsd.ioctl"] = "syscall/freebsd/ioctl.lua"; + ["syscall.freebsd.types"] = "syscall/freebsd/types.lua"; + ["syscall.freebsd.fcntl"] = "syscall/freebsd/fcntl.lua"; + ["syscall.freebsd.errors"] = "syscall/freebsd/errors.lua"; + ["syscall.freebsd.util"] = "syscall/freebsd/util.lua"; + ["syscall.freebsd.version"] = "syscall/freebsd/version.lua"; + ["syscall.freebsd.sysctl"] = "syscall/freebsd/sysctl.lua"; + } + }; + netbsd = + { + modules = + { + ["syscall.netbsd.syscalls"] = "syscall/netbsd/syscalls.lua"; + ["syscall.netbsd.c"] = "syscall/netbsd/c.lua"; + ["syscall.netbsd.constants"] = "syscall/netbsd/constants.lua"; + ["syscall.netbsd.ffitypes"] = "syscall/netbsd/ffitypes.lua"; + ["syscall.netbsd.ffifunctions"] = "syscall/netbsd/ffifunctions.lua"; + ["syscall.netbsd.ioctl"] = "syscall/netbsd/ioctl.lua"; + ["syscall.netbsd.types"] = "syscall/netbsd/types.lua"; + ["syscall.netbsd.fcntl"] = "syscall/netbsd/fcntl.lua"; + ["syscall.netbsd.errors"] = "syscall/netbsd/errors.lua"; + ["syscall.netbsd.util"] = "syscall/netbsd/util.lua"; + ["syscall.netbsd.nr"] = "syscall/netbsd/nr.lua"; + ["syscall.netbsd.init"] = "syscall/netbsd/init.lua"; + ["syscall.netbsd.version"] = "syscall/netbsd/version.lua"; + ["syscall.netbsd.sysctl"] = "syscall/netbsd/sysctl.lua"; + } + }; + openbsd = + { + modules = + { + ["syscall.openbsd.syscalls"] = "syscall/openbsd/syscalls.lua"; + ["syscall.openbsd.c"] = "syscall/openbsd/c.lua"; + ["syscall.openbsd.constants"] = "syscall/openbsd/constants.lua"; + 
["syscall.openbsd.ffi"] = "syscall/openbsd/ffi.lua"; + ["syscall.openbsd.ioctl"] = "syscall/openbsd/ioctl.lua"; + ["syscall.openbsd.types"] = "syscall/openbsd/types.lua"; + ["syscall.openbsd.fcntl"] = "syscall/openbsd/fcntl.lua"; + ["syscall.openbsd.errors"] = "syscall/openbsd/errors.lua"; + ["syscall.openbsd.util"] = "syscall/openbsd/util.lua"; + ["syscall.openbsd.version"] = "syscall/openbsd/version.lua"; + ["syscall.openbsd.sysctl"] = "syscall/openbsd/sysctl.lua"; + } + }; + bsd = + { + modules = + { + ["syscall.bsd.syscalls"] = "syscall/bsd/syscalls.lua"; + ["syscall.bsd.ffi"] = "syscall/bsd/ffi.lua"; + ["syscall.bsd.types"] = "syscall/bsd/types.lua"; + } + }; + } +} diff --git a/lib/ljsyscall/syscall/bsd/ffi.lua b/lib/ljsyscall/syscall/bsd/ffi.lua index a09b59816f..a0fed50a4f 100644 --- a/lib/ljsyscall/syscall/bsd/ffi.lua +++ b/lib/ljsyscall/syscall/bsd/ffi.lua @@ -147,7 +147,6 @@ int mkfifoat(int dirfd, const char *pathname, mode_t mode); int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags); int readlinkat(int dirfd, const char *pathname, char *buf, size_t bufsiz); int faccessat(int dirfd, const char *pathname, int mode, int flags); -int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); int futimens(int fd, const struct timespec times[2]); int utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags); diff --git a/lib/ljsyscall/syscall/freebsd/constants.lua b/lib/ljsyscall/syscall/freebsd/constants.lua index 305a8cb151..b1a703da03 100644 --- a/lib/ljsyscall/syscall/freebsd/constants.lua +++ b/lib/ljsyscall/syscall/freebsd/constants.lua @@ -1335,5 +1335,13 @@ c.CAP_RIGHTS_VERSION = 0 -- we do not understand others end -- freebsd >= 10 +if version >= 11 then +-- for utimensat +c.UTIME = strflag { + NOW = -1, + OMIT = -2, +} +end + return c diff --git a/lib/ljsyscall/syscall/freebsd/ffi.lua b/lib/ljsyscall/syscall/freebsd/ffi.lua index 872237e771..fecc9f509b 100644 --- 
a/lib/ljsyscall/syscall/freebsd/ffi.lua +++ b/lib/ljsyscall/syscall/freebsd/ffi.lua @@ -297,6 +297,7 @@ int cap_ioctls_limit(int fd, const unsigned long *cmds, size_t ncmds); ssize_t cap_ioctls_get(int fd, unsigned long *cmds, size_t maxcmds); int cap_fcntls_limit(int fd, uint32_t fcntlrights); int cap_fcntls_get(int fd, uint32_t *fcntlrightsp); +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); int __sys_utimes(const char *filename, const struct timeval times[2]); int __sys_futimes(int, const struct timeval times[2]); diff --git a/lib/ljsyscall/syscall/linux/arm/nr.lua b/lib/ljsyscall/syscall/linux/arm/nr.lua index 058c8158d1..7fdb66d58a 100644 --- a/lib/ljsyscall/syscall/linux/arm/nr.lua +++ b/lib/ljsyscall/syscall/linux/arm/nr.lua @@ -338,6 +338,15 @@ local nr = { setns = 375, process_vm_readv = 376, process_vm_writev= 377, + kcmp = 378, + finit_module = 379, + sched_setattr = 380, + sched_getattr = 381, + renameat2 = 382, + seccomp = 383, + getrandom = 384, + memfd_create = 385, + bpf = 386, } } diff --git a/lib/ljsyscall/syscall/linux/c.lua b/lib/ljsyscall/syscall/linux/c.lua index 33c09e1425..e52a771a1a 100644 --- a/lib/ljsyscall/syscall/linux/c.lua +++ b/lib/ljsyscall/syscall/linux/c.lua @@ -7,12 +7,8 @@ Note a fair number are being deprecated, see include/uapi/asm-generic/unistd.h u Some of these we already don't use, but some we do, eg use open not openat etc. 
]] -local require, error, assert, tonumber, tostring, -setmetatable, pairs, ipairs, unpack, rawget, rawset, -pcall, type, table, string, select = -require, error, assert, tonumber, tostring, -setmetatable, pairs, ipairs, unpack, rawget, rawset, -pcall, type, table, string, select +local require, tonumber, pcall, select = +require, tonumber, pcall, select local abi = require "syscall.abi" @@ -34,7 +30,6 @@ local uint, ulong = ffi.typeof("unsigned int"), ffi.typeof("unsigned long") local h = require "syscall.helpers" local err64 = h.err64 -local errpointer = h.errpointer local i6432, u6432 = bit.i6432, bit.u6432 @@ -53,7 +48,6 @@ else arg64u = function(val) return u6432(val) end end -- _llseek very odd, preadv -local function llarg64u(val) return u6432(val) end local function llarg64(val) return i6432(val) end local C = {} @@ -69,7 +63,6 @@ local u64 = ffi.typeof("uint64_t") -- TODO could make these return errno here, also are these best casts? local syscall_long = ffi.C.syscall -- returns long local function syscall(...) return tonumber(syscall_long(...)) end -- int is default as most common -local function syscall_uint(...) return uint(syscall_long(...)) end local function syscall_void(...) return void(syscall_long(...)) end local function syscall_off(...) return u64(syscall_long(...)) end -- off_t @@ -182,6 +175,7 @@ end -- glibc caches pid, but this fails to work eg after clone(). 
function C.getpid() return syscall(sys.getpid) end
+function C.gettid() return syscall(sys.gettid) end
 
 -- underlying syscalls
 function C.exit_group(status) return syscall(sys.exit_group, int(status)) end -- void return really
@@ -641,6 +635,7 @@ end
 function C.timer_gettime(timerid, curr_value) return syscall(sys.timer_gettime, int(timerid), void(curr_value)) end
 function C.timer_delete(timerid) return syscall(sys.timer_delete, int(timerid)) end
 function C.timer_getoverrun(timerid) return syscall(sys.timer_getoverrun, int(timerid)) end
+function C.vhangup() return syscall(sys.vhangup) end
 
 -- only on some architectures
 if sys.waitpid then
@@ -701,6 +696,18 @@ if sys.time then
   function C.time(t) return syscall(sys.time, void(t)) end
 end
 
+-- bpf syscall that is only on Linux 3.19+; its third argument (size of the attr union) is an unsigned int
+if sys.bpf then
+  function C.bpf(cmd, attr)
+    return syscall(sys.bpf, int(cmd), void(attr), uint(ffi.sizeof('union bpf_attr')))
+  end
+end
+if sys.perf_event_open then
+  function C.perf_event_open(attr, pid, cpu, group_fd, flags)
+    return syscall(sys.perf_event_open, void(attr), int(pid), int(cpu), int(group_fd), ulong(flags))
+  end
+end
+
 -- socketcalls
 if not sys.socketcall then
   function C.socket(domain, tp, protocol) return syscall(sys.socket, int(domain), int(tp), int(protocol)) end
diff --git a/lib/ljsyscall/syscall/linux/constants.lua b/lib/ljsyscall/syscall/linux/constants.lua
index 3497885f92..ec0c32807b 100644
--- a/lib/ljsyscall/syscall/linux/constants.lua
+++ b/lib/ljsyscall/syscall/linux/constants.lua
@@ -161,9 +161,12 @@ c.F = strflag(arch.F or {
   SETLEASE = 1024,
   GETLEASE = 1025,
   NOTIFY = 1026,
+  CANCELLK = 1029,
+  DUPFD_CLOEXEC = 1030,
   SETPIPE_SZ = 1031,
   GETPIPE_SZ = 1032,
-  DUPFD_CLOEXEC = 1030,
+  ADD_SEALS = 1033,
+  GET_SEALS = 1034,
 })
 
 -- messy
@@ -208,6 +211,14 @@ c.LOCK = multiflags {
   RW = 192,
 }
 
+-- for memfd
+c.F_SEAL = multiflags {
+  SEAL = 0x0001,
+  SHRINK = 0x0002,
+  GROW = 0x0004,
+  WRITE = 0x0008,
+}
+
 --mmap
 c.PROT = multiflags {
   NONE = 0x0,
@@ -300,6 +311,8 @@ 
c.SEEK = strflag { SET = 0, CUR = 1, END = 2, + DATA = 3, + HOLE = 4, } -- exit @@ -393,6 +406,12 @@ c.SOCK = multiflags(arch.SOCK or { c.SCM = strflag { RIGHTS = 0x01, CREDENTIALS = 0x02, + + TSTAMP_SND = 0, + TSTAMP_SCHED = 1, + TSTAMP_ACK = 2, + + TIMESTAMPING_OPT_STATS = 54, } -- setsockopt @@ -407,6 +426,7 @@ c.SOL = strflag { ATM = 264, AAL = 265, IRDA = 266, + XDP = 283 } if arch.SOLSOCKET then c.SOL.SOCKET = arch.SOLSOCKET else c.SOL.SOCKET = 1 end @@ -426,7 +446,7 @@ c.SO = strflag(arch.SO or { PRIORITY = 12, LINGER = 13, BSDCOMPAT = 14, ---REUSEPORT = 15, -- new, may not be defined yet + REUSEPORT = 15, -- new, may not be defined yet PASSCRED = 16, PEERCRED = 17, RCVLOWAT = 18, @@ -455,13 +475,57 @@ c.SO = strflag(arch.SO or { WIFI_STATUS = 41, PEEK_OFF = 42, NOFCS = 43, + LOCK_FILTER = 44, + SELECT_ERR_QUEUE = 45, + BUSY_POLL = 46, + MAX_PACING_RATE = 47, + BPF_EXTENSIONS = 48, + INCOMING_CPU = 49, + ATTACH_BPF = 50, + ATTACH_REUSEPORT_CBPF = 51, + ATTACH_REUSEPORT_EBPF = 52, + XDP_MMAP_OFFSETS = 1, + XDP_RX_RING = 2, + XDP_TX_RING = 3, + XDP_UMEM_REG = 4, + XDP_UMEM_FILL_RING = 5, + XDP_UMEM_COMPLETION_RING = 6, + XDP_STATISTICS = 7, + XDP_OPTIONS = 8 }) c.SO.GET_FILTER = c.SO.ATTACH_FILTER +c.SO.DETACH_BPF = c.SO.DETACH_FILTER + +c.SCM.TIMESTAMP = c.SO.TIMESTAMP +c.SCM.TIMESTAMPNS = c.SO.TIMESTAMPNS +c.SCM.TIMESTAMPING = c.SO.TIMESTAMPING -- Maximum queue length specifiable by listen. 
c.SOMAXCONN = 128 +c.SOF = strflag { + TIMESTAMPING_TX_HARDWARE = bit.lshift(1, 0), + TIMESTAMPING_TX_SOFTWARE = bit.lshift(1, 1), + TIMESTAMPING_RX_HARDWARE = bit.lshift(1, 2), + TIMESTAMPING_RX_SOFTWARE = bit.lshift(1, 3), + TIMESTAMPING_SOFTWARE = bit.lshift(1, 4), + TIMESTAMPING_SYS_HARDWARE = bit.lshift(1, 5), + TIMESTAMPING_RAW_HARDWARE = bit.lshift(1, 6), + TIMESTAMPING_OPT_ID = bit.lshift(1, 7), + TIMESTAMPING_TX_SCHED = bit.lshift(1, 8), + TIMESTAMPING_TX_ACK = bit.lshift(1, 9), + TIMESTAMPING_OPT_CMSG = bit.lshift(1, 10), + TIMESTAMPING_OPT_TSONLY = bit.lshift(1, 11), + TIMESTAMPING_OPT_STATS = bit.lshift(1, 12), + TIMESTAMPING_OPT_PKTINFO = bit.lshift(1, 13), + TIMESTAMPING_OPT_TX_SWHW = bit.lshift(1, 14), +} + +c.SOF.TIMESTAMPING_LAST = c.SOF.TIMESTAMPING_OPT_TX_SWHW +c.SOF.TIMESTAMPING_MASK = bit.bor(c.SOF.TIMESTAMPING_LAST - 1, + c.SOF.TIMESTAMPING_LAST) + -- shutdown c.SHUT = strflag { RD = 0, @@ -1162,6 +1226,10 @@ c.RTA = strflag { MP_ALGO = 14, TABLE = 15, MARK = 16, + MFC_STATS = 17, + VIA = 18, + NEWDST = 19, + PREF = 20, } -- route flags @@ -1374,6 +1442,7 @@ c.AF = strflag { CAIF = 37, ALG = 38, NFC = 39, + XDP = 44 } c.AF.UNIX = c.AF.LOCAL @@ -1954,6 +2023,7 @@ c.EM = strflag { MN10300 = 89, BLACKFIN = 106, TI_C6000 = 140, + AARCH64 = 183, FRV = 0x5441, AVR32 = 0x18ad, ALPHA = 0x9026, @@ -1970,6 +2040,7 @@ local __AUDIT_ARCH_64BIT = 0x80000000 local __AUDIT_ARCH_LE = 0x40000000 c.AUDIT_ARCH = strflag { + AARCH64 = c.EM.AARCH64 + __AUDIT_ARCH_64BIT + __AUDIT_ARCH_LE, ALPHA = c.EM.ALPHA + __AUDIT_ARCH_64BIT + __AUDIT_ARCH_LE, ARM = c.EM.ARM + __AUDIT_ARCH_LE, ARMEB = c.EM.ARM, @@ -2007,6 +2078,7 @@ c.BPF = multiflags { ST = 0x02, STX = 0x03, ALU = 0x04, + ALU64 = 0x07, JMP = 0x05, RET = 0x06, MISC = 0x07, @@ -2014,6 +2086,7 @@ c.BPF = multiflags { W = 0x00, H = 0x08, B = 0x10, + DW = 0x18, -- mode IMM = 0x00, ABS = 0x20, @@ -2030,12 +2103,23 @@ c.BPF = multiflags { AND = 0x50, LSH = 0x60, RSH = 0x70, + ARSH = 0xc0, NEG = 0x80, + MOD = 0x90, + 
XOR = 0xa0, + MOV = 0xb0, + XADD = 0xc0, + END = 0xd0, JA = 0x00, JEQ = 0x10, JGT = 0x20, JGE = 0x30, JSET = 0x40, + JNE = 0x50, + JSGT = 0x60, + JSGE = 0x70, + CALL = 0x80, + EXIT = 0x90, -- src K = 0x00, X = 0x08, @@ -2044,6 +2128,245 @@ c.BPF = multiflags { -- miscop TAX = 0x00, TXA = 0x80, + TO_LE = 0x00, + TO_BE = 0x08, +-- flags + ANY = 0, + NOEXIST = 1, + EXIST = 2, +} + +-- BPF map type +c.BPF_MAP = strflag { + UNSPEC = 0, + HASH = 1, + ARRAY = 2, + PROG_ARRAY = 3, + PERF_EVENT_ARRAY = 4, + PERCPU_HASH = 5, + PERCPU_ARRAY = 6, + STACK_TRACE = 7, + CGROUP_ARRAY = 8, + LRU_HASH = 9, + LRU_PERCPU_HASH = 10, + LPM_TRIE = 11, + ARRAY_OF_MAPS = 12, + HASH_OF_MAPS = 13, + DEVMAP = 14, + SOCKMAP = 15, + CPUMAP = 16, + XSKMAP = 17 +} + +-- BPF syscall commands +c.BPF_CMD = strflag { + MAP_CREATE = 0, + MAP_LOOKUP_ELEM = 1, + MAP_UPDATE_ELEM = 2, + MAP_DELETE_ELEM = 3, + MAP_GET_NEXT_KEY = 4, + PROG_LOAD = 5, + OBJ_PIN = 6, + OBJ_GET = 7, + PROG_ATTACH = 8, + PROG_DETACH = 9, + PROG_TEST_RUN = 10, + PROG_GET_NEXT_ID = 11, + MAP_GET_NEXT_ID = 12, + PROG_GET_FD_BY_ID = 13, + MAP_GET_FD_BY_ID = 14, + OBJ_GET_INFO_BY_FD = 15, + PROG_QUERY = 16, + RAW_TRACEPOINT_OPEN = 17, +} + +-- BPF program types +c.BPF_PROG = strflag { + UNSPEC = 0, + SOCKET_FILTER = 1, + KPROBE = 2, + SCHED_CLS = 3, + SCHED_ACT = 4, + TRACEPOINT = 5, + XDP = 6, + PERF_EVENT = 7, + CGROUP_SKB = 8, + CGROUP_SOCK = 9, + LWT_IN = 10, + LWT_OUT = 11, + LWT_XMIT = 12, + SOCK_OPS = 13, + SK_SKB = 14, + CGROUP_DEVICE = 15, + SK_MSG = 16, + RAW_TRACEPOINT = 17, + CGROUP_SOCK_ADDR = 18, +} + +-- BPF attach type +c.BPF_ATTACH_TYPE = strflag { + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS = 1, + CGROUP_INET_SOCK_CREATE = 2, + CGROUP_SOCK_OPS = 3, + SK_SKB_STREAM_PARSER = 4, + SK_SKB_STREAM_VERDICT = 5, + CGROUP_DEVICE = 6, + SK_MSG_VERDICT = 7, + CGROUP_INET4_BIND = 8, + CGROUP_INET6_BIND = 9, + CGROUP_INET4_CONNECT = 10, + CGROUP_INET6_CONNECT = 11, + CGROUP_INET4_POST_BIND = 12, + CGROUP_INET6_POST_BIND = 
13, +} + +-- Linux performance monitoring +-- perf_event_attr.type +c.PERF_TYPE = strflag { + HARDWARE = 0, + SOFTWARE = 1, + TRACEPOINT = 2, + HW_CACHE = 3, + RAW = 4, + BREAKPOINT = 5, +} + +-- perf_event_attr.event_id +c.PERF_COUNT = strflag { + -- Generalized performance event event_id types + HW_CPU_CYCLES = 0, + HW_INSTRUCTIONS = 1, + HW_CACHE_REFERENCES = 2, + HW_CACHE_MISSES = 3, + HW_BRANCH_INSTRUCTIONS = 4, + HW_BRANCH_MISSES = 5, + HW_BUS_CYCLES = 6, + HW_STALLED_CYCLES_FRONTEND = 7, + HW_STALLED_CYCLES_BACKEND = 8, + HW_REF_CPU_CYCLES = 9, + -- Generalized hardware cache events + HW_CACHE_L1D = 0, + HW_CACHE_L1I = 1, + HW_CACHE_LL = 2, + HW_CACHE_DTLB = 3, + HW_CACHE_ITLB = 4, + HW_CACHE_BPU = 5, + HW_CACHE_NODE = 6, + HW_CACHE_OP_READ = 0, + HW_CACHE_OP_WRITE = 1, + HW_CACHE_OP_PREFETCH = 2, + HW_CACHE_RESULT_ACCESS = 0, + HW_CACHE_RESULT_MISS = 1, + -- Special "software" events provided by the kernel + SW_CPU_CLOCK = 0, + SW_TASK_CLOCK = 1, + SW_PAGE_FAULTS = 2, + SW_CONTEXT_SWITCHES = 3, + SW_CPU_MIGRATIONS = 4, + SW_PAGE_FAULTS_MIN = 5, + SW_PAGE_FAULTS_MAJ = 6, + SW_ALIGNMENT_FAULTS = 7, + SW_EMULATION_FAULTS = 8, + SW_DUMMY = 9, + SW_BPF_OUTPUT = 10, +} + +-- Bits that can be set in perf_event_attr.sample_type to request information +c.PERF_SAMPLE = multiflags { + IP = bit.lshift(1, 0), + TID = bit.lshift(1, 1), + TIME = bit.lshift(1, 2), + ADDR = bit.lshift(1, 3), + READ = bit.lshift(1, 4), + CALLCHAIN = bit.lshift(1, 5), + ID = bit.lshift(1, 6), + CPU = bit.lshift(1, 7), + PERIOD = bit.lshift(1, 8), + STREAM_ID = bit.lshift(1, 9), + RAW = bit.lshift(1, 10), + BRANCH_STACK = bit.lshift(1, 11), + REGS_USER = bit.lshift(1, 12), + STACK_USER = bit.lshift(1, 13), + WEIGHT = bit.lshift(1, 14), + DATA_SRC = bit.lshift(1, 15), + IDENTIFIER = bit.lshift(1, 16), + TRANSACTION = bit.lshift(1, 17), + REGS_INTR = bit.lshift(1, 18), +} + +-- values to program into perf_event_attr.branch_sample_type when PERF_SAMPLE_BRANCH is set +c.PERF_SAMPLE_BRANCH = 
multiflags { + USER_SHIFT = 0, + KERNEL_SHIFT = 1, + HV_SHIFT = 2, + ANY_SHIFT = 3, + ANY_CALL_SHIFT = 4, + ANY_RETURN_SHIFT = 5, + IND_CALL_SHIFT = 6, + ABORT_TX_SHIFT = 7, + IN_TX_SHIFT = 8, + NO_TX_SHIFT = 9, + COND_SHIFT = 10, + CALL_STACK_SHIFT = 11, + IND_JUMP_SHIFT = 12, + CALL_SHIFT = 13, + NO_FLAGS_SHIFT = 14, + NO_CYCLES_SHIFT = 15, +} +c.PERF_SAMPLE_BRANCH.USER = bit.lshift(1, c.PERF_SAMPLE_BRANCH.USER_SHIFT) +c.PERF_SAMPLE_BRANCH.KERNEL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.KERNEL_SHIFT) +c.PERF_SAMPLE_BRANCH.HV = bit.lshift(1, c.PERF_SAMPLE_BRANCH.HV_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY_CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY_RETURN = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_RETURN_SHIFT) +c.PERF_SAMPLE_BRANCH.IND_CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IND_CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.ABORT_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ABORT_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.IN_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IN_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.COND = bit.lshift(1, c.PERF_SAMPLE_BRANCH.COND_SHIFT) +c.PERF_SAMPLE_BRANCH.CALL_STACK = bit.lshift(1, c.PERF_SAMPLE_BRANCH.CALL_STACK_SHIFT) +c.PERF_SAMPLE_BRANCH.IND_JUMP = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IND_JUMP_SHIFT) +c.PERF_SAMPLE_BRANCH.CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_FLAGS = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_FLAGS_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_CYCLES = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_CYCLES_SHIFT) + +-- Flags for perf_attr.read_format +c.PERF_READ_FORMAT = multiflags { + TOTAL_TIME_ENABLED = bit.lshift(1, 0), + TOTAL_TIME_RUNNING = bit.lshift(1, 1), + ID = bit.lshift(1, 2), + GROUP = bit.lshift(1, 3), +} + +-- Flags for perf_event_open +c.PERF_FLAG = multiflags { + FD_NO_GROUP = bit.lshift(1, 0), + FD_OUTPUT = bit.lshift(1, 1), + PID_CGROUP = 
bit.lshift(1, 2), + FD_CLOEXEC = bit.lshift(1, 3), +} + + +-- If perf_event_attr.sample_id_all is set then all event types will +-- have the sample_type selected fields related to where/when +-- (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, IDENTIFIER) +c.PERF_RECORD = strflag { + MMAP = 1, + LOST = 2, + COMM = 3, + EXIT = 4, + THROTTLE = 5, + UNTHROTTLE = 6, + FORK = 7, + READ = 8, + SAMPLE = 9, + MMAP2 = 10, + AUX = 11, + ITRACE_START = 12, + LOST_SAMPLES = 13, + SWITCH = 14, + SWITCH_CPU_WIDE= 15, } -- termios - c_cc characters diff --git a/lib/ljsyscall/syscall/linux/fcntl.lua b/lib/ljsyscall/syscall/linux/fcntl.lua index dd6621ffc6..67567c25be 100644 --- a/lib/ljsyscall/syscall/linux/fcntl.lua +++ b/lib/ljsyscall/syscall/linux/fcntl.lua @@ -22,6 +22,7 @@ local fcntl = { [c.F.GETLK] = t.flock, [c.F.SETLK] = t.flock, [c.F.SETLKW] = t.flock, + [c.F.ADD_SEALS] = function(arg) return c.F_SEAL[arg] end, }, ret = { [c.F.DUPFD] = function(ret) return t.fd(ret) end, @@ -33,6 +34,7 @@ local fcntl = { [c.F.GETSIG] = function(ret) return tonumber(ret) end, [c.F.GETPIPE_SZ] = function(ret) return tonumber(ret) end, [c.F.GETLK] = function(ret, arg) return arg end, + [c.F.GET_SEALS] = function(ret) return tonumber(ret) end, } } diff --git a/lib/ljsyscall/syscall/linux/ffi.lua b/lib/ljsyscall/syscall/linux/ffi.lua index 084fa7cdcf..2df6267494 100644 --- a/lib/ljsyscall/syscall/linux/ffi.lua +++ b/lib/ljsyscall/syscall/linux/ffi.lua @@ -498,10 +498,144 @@ struct sock_filter { uint8_t jf; uint32_t k; }; +struct bpf_insn { + uint8_t code; /* opcode */ + uint8_t dst_reg:4; /* dest register */ + uint8_t src_reg:4; /* source register */ + uint16_t off; /* signed offset */ + uint32_t imm; /* signed immediate constant */ +}; struct sock_fprog { unsigned short len; struct sock_filter *filter; }; +union bpf_attr { + struct { + uint32_t map_type; + uint32_t key_size; + uint32_t value_size; + uint32_t max_entries; + }; + struct { + uint32_t map_fd; + uint64_t key 
__attribute__((aligned(8))); + union { + uint64_t value __attribute__((aligned(8))); + uint64_t next_key __attribute__((aligned(8))); + }; + uint64_t flags; + }; + struct { + uint32_t prog_type; + uint32_t insn_cnt; + uint64_t insns __attribute__((aligned(8))); + uint64_t license __attribute__((aligned(8))); + uint32_t log_level; + uint32_t log_size; + uint64_t log_buf __attribute__((aligned(8))); + uint32_t kern_version; + }; + struct { + uint64_t pathname __attribute__((aligned(8))); + uint32_t bpf_fd; + uint32_t file_flags; + }; +} __attribute__((aligned(8))); +struct perf_event_attr { + uint32_t pe_type; + uint32_t size; + uint64_t pe_config; + union { + uint64_t sample_period; + uint64_t sample_freq; + }; + uint64_t pe_sample_type; + uint64_t read_format; + uint32_t disabled:1, + inherit:1, + pinned:1, + exclusive:1, + exclude_user:1, + exclude_kernel:1, + exclude_hv:1, + exclude_idle:1, + mmap:1, + comm:1, + freq:1, + inherit_stat:1, + enable_on_exec:1, + task:1, + watermark:1, + precise_ip:2, + mmap_data:1, + sample_id_all:1, + exclude_host:1, + exclude_guest:1, + exclude_callchain_kernel:1, + exclude_callchain_user:1, + mmap2:1, + comm_exec:1, + use_clockid:1, + __reserved_1a:6; + uint32_t __reserved_1b; + union { + uint32_t wakeup_events; + uint32_t wakeup_watermark; + }; + uint32_t bp_type; + union { + uint64_t bp_addr; + uint64_t config1; + }; + union { + uint64_t bp_len; + uint64_t config2; + }; + uint64_t branch_sample_type; + uint64_t sample_regs_user; + uint32_t sample_stack_user; + int32_t clockid; + uint64_t sample_regs_intr; + uint32_t aux_watermark; + uint32_t __reserved_2; +}; +struct perf_event_mmap_page { + uint32_t version; + uint32_t compat_version; + uint32_t lock; + uint32_t index; + int64_t offset; + uint64_t time_enabled; + uint64_t time_running; + union { + uint64_t capabilities; + struct { + uint32_t cap_bit0 : 1, + cap_bit0_is_deprecated : 1, + cap_user_rdpmc : 1, + cap_user_time : 1, + cap_user_time_zero : 1; + }; + }; + uint16_t 
pmc_width; + uint16_t time_shift; + uint32_t time_mult; + uint64_t time_offset; + uint64_t __reserved[120]; + volatile uint64_t data_head; + volatile uint64_t data_tail; + volatile uint64_t data_offset; + volatile uint64_t data_size; + uint64_t aux_head; + uint64_t aux_tail; + uint64_t aux_offset; + uint64_t aux_size; +}; +struct perf_event_header { + uint32_t type; + uint16_t misc; + uint16_t size; +}; struct mq_attr { long mq_flags, mq_maxmsg, mq_msgsize, mq_curmsgs, __unused[4]; }; @@ -703,6 +837,9 @@ struct rusage { long ru_nvcsw; long ru_nivcsw; }; +struct scm_timestamping { + struct timespec ts[3]; +}; ]] append(arch.nsig or [[ diff --git a/lib/ljsyscall/syscall/linux/ioctl.lua b/lib/ljsyscall/syscall/linux/ioctl.lua index 7ab871c92b..c695dca2c3 100644 --- a/lib/ljsyscall/syscall/linux/ioctl.lua +++ b/lib/ljsyscall/syscall/linux/ioctl.lua @@ -193,6 +193,7 @@ local ioctl = strflag { SIOCSIFFLAGS = 0x8914, SIOCGIFMTU = 0x8921, SIOCSIFMTU = 0x8922, + SIOCSIFHWADDR = 0x8924, SIOCGIFHWADDR = 0x8927, SIOCGIFINDEX = 0x8933, @@ -271,6 +272,16 @@ local ioctl = strflag { -- from linux/vfio.h type is ';' base is 100 VFIO_GET_API_VERSION = vfio('NONE', 0), VFIO_CHECK_EXTENSION = vfio('WRITE', 1, "uint32"), +-- from linux/perf_event.h + PERF_EVENT_IOC_ENABLE = _IO('$', 0), + PERF_EVENT_IOC_DISABLE = _IO('$', 1), + PERF_EVENT_IOC_REFRESH = _IO('$', 2), + PERF_EVENT_IOC_RESET = _IO('$', 3), + PERF_EVENT_IOC_PERIOD = _IOW('$', 4, "uint64"), + PERF_EVENT_IOC_SET_OUTPUT= _IO('$', 5), + PERF_EVENT_IOC_SET_FILTER= _IOW('$', 6, "uintptr"), + PERF_EVENT_IOC_ID = _IOR('$', 7, "uint64_1"), + PERF_EVENT_IOC_SET_BPF = _IOW('$', 8, "uint32"), -- allow user defined ioctls _IO = _IO, diff --git a/lib/ljsyscall/syscall/linux/nl.lua b/lib/ljsyscall/syscall/linux/nl.lua index fad5625ed8..a7da48a831 100644 --- a/lib/ljsyscall/syscall/linux/nl.lua +++ b/lib/ljsyscall/syscall/linux/nl.lua @@ -173,6 +173,10 @@ local rta_decode = { ir.cacheinfo = t.rta_cacheinfo() ffi.copy(ir.cacheinfo, buf, 
s.rta_cacheinfo) end, + [c.RTA.PREF] = function(ir, buf, len) + local i = pt.uint8(buf) + ir.pref = tonumber(i[0]) + end, -- TODO some missing } @@ -340,6 +344,44 @@ mt.iflink = { end } +meth.ndmsg = { + index = { + family = function(i) return tonumber(i.ndmsg.ndm_family) end, + ifindex = function(i) return tonumber(i.ndmsg.ndm_ifindex) end, + state = function(i) return tonumber(i.ndmsg.ndm_state) end, + flags = function(i) return tonumber(i.ndmsg.ndm_flags) end, + type = function(i) return tonumber(i.ndmsg.ndm_type) end, + dest = function(i) return i.dst or addrtype(i.family) end, + -- might not be set in Lua table, so return nil + dst = function() return nil end, + lladdr = function() return nil end, + }, + flags = { + [c.NTF.PROXY] = "proxy", + [c.NTF.ROUTER] = "router", + }, + state = { + [c.NUD.INCOMPLETE] = "incomplete", + [c.NUD.REACHABLE] = "reachable", + [c.NUD.STALE] = "stale", + [c.NUD.DELAY] = "delay", + [c.NUD.PROBE] = "probe", + [c.NUD.FAILED] = "failed", + [c.NUD.NOARP] = "noarp", + [c.NUD.PERMANENT] = "permanent", + } +} + +mt.ndmsg = { + __index = function(i, k) + if meth.ndmsg.index[k] then return meth.ndmsg.index[k](i) end + end, + __tostring = function(i) -- TODO make more like output of ip route + local s = "dst: " .. tostring(i.dest) .. " lladdr: " .. tostring(i.lladdr) .. " if: " .. 
i.ifindex + return s + end, +} + meth.rtmsg = { index = { family = function(i) return tonumber(i.rtmsg.rtm_family) end, @@ -378,6 +420,17 @@ mt.rtmsg = { end, } + +mt.neighs = { + __tostring = function(is) + local s = {} + for k, v in ipairs(is) do + s[#s + 1] = tostring(v) + end + return table.concat(s, '\n') + end, +} + meth.routes = { fn = { match = function(rs, addr, len) -- exact match @@ -496,12 +549,12 @@ local function decode_route(buf, len) end local function decode_neigh(buf, len) - local rt = pt.rtmsg(buf) - buf = buf + nlmsg_align(s.rtmsg) - len = len - nlmsg_align(s.rtmsg) + local rt = pt.ndmsg(buf) + buf = buf + nlmsg_align(s.ndmsg) + len = len - nlmsg_align(s.ndmsg) local rtattr = pt.rtattr(buf) - local ir = setmetatable({rtmsg = t.rtmsg()}, mt.rtmsg) - ffi.copy(ir.rtmsg, rt, s.rtmsg) + local ir = setmetatable({ndmsg = t.ndmsg()}, mt.ndmsg) + ffi.copy(ir.ndmsg, rt, s.ndmsg) while rta_ok(rtattr, len) do if nda_decode[rtattr.rta_type] then nda_decode[rtattr.rta_type](ir, buf + rta_length(0), rta_align(rtattr.rta_len) - rta_length(0)) @@ -1051,7 +1104,9 @@ function nl.getneigh(index, tab, ...) if type(index) == 'table' then index = index.index end tab.ifindex = index local ndm = t.ndmsg(tab) - return nlmsg("getneigh", "request, dump", ndm.family, t.ndmsg, ndm, ...) + local n, err = nlmsg("getneigh", "request, dump", ndm.family, t.ndmsg, ndm, ...) + if not n then return nil, err end + return setmetatable(n, mt.neighs) end function nl.newneigh(index, tab, ...) 
diff --git a/lib/ljsyscall/syscall/linux/ppc/nr.lua b/lib/ljsyscall/syscall/linux/ppc/nr.lua
index 010fdb4c24..1d712ebf03 100644
--- a/lib/ljsyscall/syscall/linux/ppc/nr.lua
+++ b/lib/ljsyscall/syscall/linux/ppc/nr.lua
@@ -354,6 +354,15 @@ local nr = {
   setns = 350,
   process_vm_readv = 351,
   process_vm_writev = 352,
+  finit_module = 353, -- powerpc numbers finit_module before kcmp (unlike arm)
+  kcmp = 354, -- same value the ppc64le table uses
+  sched_setattr = 355,
+  sched_getattr = 356,
+  renameat2 = 357,
+  seccomp = 358,
+  getrandom = 359,
+  memfd_create = 360,
+  bpf = 361,
 }
 }
 
diff --git a/lib/ljsyscall/syscall/linux/ppc64le/nr.lua b/lib/ljsyscall/syscall/linux/ppc64le/nr.lua
index bd0df08fc1..0aa6ca6a01 100644
--- a/lib/ljsyscall/syscall/linux/ppc64le/nr.lua
+++ b/lib/ljsyscall/syscall/linux/ppc64le/nr.lua
@@ -349,6 +349,19 @@ local nr = {
   kcmp = 354,
   sched_setattr = 355,
   sched_getattr = 356,
+  renameat2 = 357,
+  seccomp = 358,
+  getrandom = 359,
+  memfd_create = 360,
+  bpf = 361,
+  execveat = 362,
+  switch_endian = 363,
+  userfaultfd = 364,
+  membarrier = 365,
+  mlock2 = 378,
+  copy_file_range = 379,
+  preadv2 = 380,
+  pwritev2 = 381,
 }
 }
 
diff --git a/lib/ljsyscall/syscall/linux/syscalls.lua b/lib/ljsyscall/syscall/linux/syscalls.lua
index 843e9e713e..8766481376 100644
--- a/lib/ljsyscall/syscall/linux/syscalls.lua
+++ b/lib/ljsyscall/syscall/linux/syscalls.lua
@@ -828,6 +828,172 @@ function S.sysctl(name, new)
   return old
 end
 
+-- BPF syscall has a complex semantics with one union serving for all purposes
+-- The interface exports both raw syscall and helper functions based on libbpf
+if C.bpf then
+  local function ptr_to_u64(p) return ffi.cast('uint64_t', ffi.cast('void *', p)) end
+  function S.bpf(cmd, attr)
+    return C.bpf(c.BPF_CMD[cmd], attr)
+  end
+  function S.bpf_prog_load(type, insns, len, license, version, log_level)
+    if not license then license = "GPL" end -- Must stay alive during the syscall
+    local bpf_log_buf = ffi.new('char [?]', 64*1024) -- Must stay alive during the syscall
+    if not version then
+      -- We have no better way to extract 
current kernel hex-string other
+      -- than parsing headers, compiling a helper function or reading /proc
+      local ver_str, count = S.sysctl('kernel.osrelease'):match('%d+%.%d+%.%d+'), 2 -- dots escaped: a bare '.' matches any character
+      version = 0
+      for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ
+        version = bit.bor(version, bit.lshift(tonumber(i), 8*count))
+        count = count - 1
+      end
+    end
+    local attr = t.bpf_attr1()
+    attr[0].prog_type = c.BPF_PROG[type]
+    attr[0].insns = ptr_to_u64(insns)
+    attr[0].insn_cnt = len
+    attr[0].license = ptr_to_u64(license)
+    attr[0].log_buf = ptr_to_u64(bpf_log_buf)
+    attr[0].log_size = ffi.sizeof(bpf_log_buf)
+    attr[0].log_level = log_level or 1
+    attr[0].kern_version = version -- MUST match current kernel version
+    local fd = S.bpf(c.BPF_CMD.PROG_LOAD, attr)
+    if fd < 0 then
+      return nil, t.error(errno()), ffi.string(bpf_log_buf)
+    end
+    return retfd(fd), ffi.string(bpf_log_buf)
+  end
+  function S.bpf_map_create(type, key_size, value_size, max_entries)
+    local attr = t.bpf_attr1()
+    attr[0].map_type = c.BPF_MAP[type]
+    attr[0].key_size = key_size
+    attr[0].value_size = value_size
+    attr[0].max_entries = max_entries
+    local fd = S.bpf(c.BPF_CMD.MAP_CREATE, attr)
+    if fd < 0 then
+      return nil, t.error(errno())
+    end
+    return retfd(fd)
+  end
+  function S.bpf_map_op(op, fd, key, val_or_next, flags)
+    local attr = t.bpf_attr1()
+    attr[0].map_fd = getfd(fd)
+    attr[0].key = ptr_to_u64(key)
+    attr[0].value = ptr_to_u64(val_or_next)
+    attr[0].flags = flags or 0
+    local ret = S.bpf(op, attr)
+    if ret ~= 0 then
+      return nil, t.error(errno())
+    end
+    return ret
+  end
+  function S.bpf_obj_pin(path, fd, flags)
+    local attr = t.bpf_attr1()
+    local pathname = ffi.new("char[?]", #path+1)
+    ffi.copy(pathname, path)
+    attr[0].pathname = ptr_to_u64(pathname)
+    attr[0].bpf_fd = getfd(fd)
+    attr[0].file_flags = flags or 0
+    local ret = S.bpf(c.BPF_CMD.OBJ_PIN, attr)
+    if ret ~= 0 then
+      return nil, t.error(errno())
+    end
+    return ret
+  end
+  function S.bpf_obj_get(path, flags)
+    local 
attr = t.bpf_attr1()
+    local pathname = ffi.new("char[?]", #path+1)
+    ffi.copy(pathname, path)
+    attr[0].pathname = ptr_to_u64(pathname)
+    attr[0].file_flags = flags or 0
+    local ret = S.bpf(c.BPF_CMD.OBJ_GET, attr)
+    if ret < 0 then
+      return nil, t.error(errno())
+    end
+    return retfd(ret)
+  end
+end
+
+-- Linux performance monitoring
+if C.perf_event_open then
+  -- Open perf event fd
+  -- @note see man 2 perf_event_open
+  -- @return fd, err
+  function S.perf_event_open(attr, pid, cpu, group_fd, flags)
+    if attr[0].size == 0 then attr[0].size = ffi.sizeof(attr[0]) end
+    local fd = C.perf_event_open(attr, pid or 0, cpu or -1, group_fd or -1, c.PERF_FLAG[flags or 0])
+    if fd < 0 then
+      return nil, t.error(errno())
+    end
+    return retfd(fd)
+  end
+  -- Read the tracepoint configuration (see "/sys/kernel/debug/tracing/available_events")
+  -- @param event_path path to tracepoint (e.g. "/sys/kernel/debug/tracing/events/syscalls/sys_enter_write")
+  -- @return tp, err (e.g. 538, nil)
+  function S.perf_tracepoint(event_path)
+    local config, ret = nil, nil
+    event_path = event_path.."/id"
+    local fd, err = S.open(event_path, c.O.RDONLY)
+    if fd then
+      ret, err = fd:read(nil, 256) -- assign the outer err so a failed read is reported to the caller
+      if ret then
+        config = tonumber(ret)
+      end
+      fd:close()
+    end
+    return config, err
+  end
+  -- Attach or detach a probe, same semantics as Lua tables.
+  -- See https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt
+  -- (When the definition is not nil, it will be created, otherwise it will be detached)
+  -- @param probe_type either "kprobe" or "uprobe", no other probe types are supported
+  -- @param name chosen probe name (e.g. "myprobe")
+  -- @param definition (set to nil to disable probe) (e.g. "do_sys_open $retval")
+  -- @param retval true/false if this should be entrypoint probe or return probe
+  -- @return tp, err (e.g. 1099, nil)
+  function S.perf_probe(probe_type, name, definition, retval)
+    local event_path = string.format('/sys/kernel/debug/tracing/%s_events', probe_type)
+    local probe_path = string.format('/sys/kernel/debug/tracing/events/%ss/%s', probe_type, name)
+    -- Check if probe already exists
+    if definition and S.statfs(probe_path) then return nil, t.error(c.E.EEXIST) end
+    local fd, err = S.open(event_path, "wronly, append")
+    if not fd then return nil, err end
+    -- Format a probe definition; `definition` itself is left untouched so it still tells attach from detach
+    local command = "-:"..name -- Detach
+    if definition then
+      command = string.format("%s:%s %s", retval and "r" or "p", name, definition)
+    end
+    local ok, err = fd:write(command)
+    fd:close()
+    -- Return tracepoint or success
+    -- (only an attach has a tracepoint id to look up; a detach just reports the write result)
+    if ok and definition then
+      return S.perf_tracepoint(probe_path)
+    end
+    return ok, err
+  end
+  -- Attach perf event reader to tracepoint (see "/sys/kernel/debug/tracing/available_events")
+  -- @param tp tracepoint identifier (e.g.: 538, use `S.perf_tracepoint()`)
+  -- @param pid,cpu,group_fd forwarded to `S.perf_event_open` (sample_type defaults to "raw")
+  -- @param attrs table of attributes (e.g. 
{sample_type="raw, callchain"}, see `struct perf_event_attr`) + -- @return reader, err + function S.perf_attach_tracepoint(tp, pid, cpu, group_fd, attrs) + local pe = t.perf_event_attr1() + pe[0].type = "tracepoint" + pe[0].config = tp + pe[0].sample_type = "raw" + pe[0].sample_period = 1 + pe[0].wakeup_events = 1 + if attrs then + for k,v in pairs(attrs) do pe[0][k] = v end + end + -- Open perf event reader with given parameters + local fd, err = S.perf_event_open(pe, pid, cpu, group_fd, "fd_cloexec") + if not fd then return nil, err end + return t.perf_reader(fd) + end +end + return S end diff --git a/lib/ljsyscall/syscall/linux/types.lua b/lib/ljsyscall/syscall/linux/types.lua index 2af4ed5112..1ccd6b4356 100644 --- a/lib/ljsyscall/syscall/linux/types.lua +++ b/lib/ljsyscall/syscall/linux/types.lua @@ -115,6 +115,7 @@ local addstructs = { ff_rumble_effect = "struct ff_rumble_effect", ff_effect = "struct ff_effect", sock_fprog = "struct sock_fprog", + bpf_attr = "union bpf_attr", user_cap_header = "struct user_cap_header", user_cap_data = "struct user_cap_data", xt_get_revision = "struct xt_get_revision", @@ -128,6 +129,7 @@ local addstructs = { vhost_vring_addr = "struct vhost_vring_addr", vhost_memory_region = "struct vhost_memory_region", vhost_memory = "struct vhost_memory", + scm_timestamping = "struct scm_timestamping", } for k, v in pairs(addtypes) do addtype(types, k, v) end @@ -136,9 +138,12 @@ for k, v in pairs(addstructs) do addtype(types, k, v, lenmt) end -- these ones not in table as not helpful with vararg or arrays TODO add more addtype variants t.inotify_event = ffi.typeof("struct inotify_event") pt.inotify_event = ptt("struct inotify_event") -- still need pointer to this +pt.perf_event_header = ptt("struct perf_event_header") t.aio_context1 = ffi.typeof("aio_context_t[1]") t.sock_fprog1 = ffi.typeof("struct sock_fprog[1]") +t.bpf_attr1 = ffi.typeof("union bpf_attr[1]") +t.perf_event_attr1 = ffi.typeof("struct perf_event_attr[1]") t.user_cap_data2 
= ffi.typeof("struct user_cap_data[2]") @@ -147,6 +152,8 @@ local iocbs = ffi.typeof("struct iocb[?]") t.iocbs = function(n, ...) return ffi.new(iocbs, n, ...) end local sock_filters = ffi.typeof("struct sock_filter[?]") t.sock_filters = function(n, ...) return ffi.new(sock_filters, n, ...) end +local bpf_insns = ffi.typeof("struct bpf_insn[?]") +t.bpf_insns = function(n, ...) return ffi.new(bpf_insns, n, ...) end local iocb_ptrs = ffi.typeof("struct iocb *[?]") t.iocb_ptrs = function(n, ...) return ffi.new(iocb_ptrs, n, ...) end @@ -760,6 +767,14 @@ mt.sock_filter = { addtype(types, "sock_filter", "struct sock_filter", mt.sock_filter) +mt.bpf_insn = { + __new = function(tp, code, dst_reg, src_reg, off, imm) + return ffi.new(tp, c.BPF[code], dst_reg or 0, src_reg or 0, off or 0, imm or 0) + end +} + +addtype(types, "bpf_insn", "struct bpf_insn", mt.bpf_insn) + -- capabilities data is an array so cannot put metatable on it. Also depends on version, so combine into one structure. -- TODO maybe add caching @@ -1231,6 +1246,23 @@ mt.mmsghdrs = { addtype_var(types, "mmsghdrs", "struct {int count; struct mmsghdr msg[?];}", mt.mmsghdrs) +addtype(types, "bpf_attr", "union bpf_attr") + +-- Metatype for Linux perf events +mt.perf_event_attr = { + index = { + type = function(self) return self.pe_type end, + config = function(self) return self.pe_config end, + sample_type = function(self) return self.pe_sample_type end, + }, + newindex = { + type = function(self, v) self.pe_type = c.PERF_TYPE[v] end, + config = function(self, v) self.pe_config = c.PERF_COUNT[v] end, + sample_type = function(self, v) self.pe_sample_type = c.PERF_SAMPLE[v] end, + }, +} +addtype(types, "perf_event_attr", "struct perf_event_attr", mt.perf_event_attr) + -- this is declared above samap_pt = { [c.AF.UNIX] = pt.sockaddr_un, diff --git a/lib/ljsyscall/syscall/linux/util.lua b/lib/ljsyscall/syscall/linux/util.lua index 76a1a86748..a46aeafc8e 100644 --- a/lib/ljsyscall/syscall/linux/util.lua +++ 
b/lib/ljsyscall/syscall/linux/util.lua
@@ -46,8 +46,11 @@ function util.if_nametoindex(name) -- standard function in some libc versions
   local s, err = S.socket(c.AF.LOCAL, c.SOCK.STREAM, 0)
   if not s then return nil, err end
   local i, err = if_nametoindex(name, s)
-  if not i then return nil, err end
-  local ok, err = s:close()
+  if not i then
+    S.close(s)
+    return nil, err
+  end
+  local ok, err = S.close(s)
   if not ok then return nil, err end
   return i
 end
diff --git a/lib/ljsyscall/syscall/linux/x64/nr.lua b/lib/ljsyscall/syscall/linux/x64/nr.lua
index 7309565435..0a91a2d2c1 100644
--- a/lib/ljsyscall/syscall/linux/x64/nr.lua
+++ b/lib/ljsyscall/syscall/linux/x64/nr.lua
@@ -323,6 +323,7 @@ local nr = {
   getrandom = 318,
   memfd_create = 319,
   kexec_file_load = 320,
+  bpf = 321,
 }
 }
 
diff --git a/lib/ljsyscall/syscall/linux/x86/nr.lua b/lib/ljsyscall/syscall/linux/x86/nr.lua
index 9757aa2e6a..deb7551239 100644
--- a/lib/ljsyscall/syscall/linux/x86/nr.lua
+++ b/lib/ljsyscall/syscall/linux/x86/nr.lua
@@ -350,6 +350,7 @@ local nr = {
   seccomp = 354,
   getrandom = 355,
   memfd_create = 356,
+  bpf = 357,
 }
 }
 
diff --git a/lib/ljsyscall/syscall/methods.lua b/lib/ljsyscall/syscall/methods.lua
index fb2dcd36f5..51b04e51bf 100644
--- a/lib/ljsyscall/syscall/methods.lua
+++ b/lib/ljsyscall/syscall/methods.lua
@@ -205,6 +205,89 @@ t.timer = metatype("struct {timer_t timerid[1];}", {
   --__gc = S.timer_delete,
 })
 
+if abi.os == "linux" then
+  -- Linux performance monitoring reader
+  t.perf_reader = metatype("struct {int fd; char *map; size_t map_pages; }", {
+    __new = function (ct, fd)
+      if not fd then return ffi.new(ct) end
+      if istype(t.fd, fd) then fd = fd:nogc():getfd() end
+      return ffi.new(ct, fd)
+    end,
+    __len = function(t) return ffi.sizeof(t) end,
+    __gc = function (t) t:close() end,
+    __index = {
+      close = function(t)
+        t:munmap()
+        if t.fd > 0 then S.close(t.fd); t.fd = -1 end -- mark closed so a later close()/__gc cannot close the fd number twice
+      end,
+      munmap = function (t)
+        if t.map_pages > 0 then
+          S.munmap(t.map, (t.map_pages + 1) * S.getpagesize())
+          t.map_pages = 0
+        end
+      end,
+      -- read(2) interface, see `perf_attr.read_format`
+      -- @return u64 or an array of u64
+      read = function (t, len)
+        local rvals = ffi.new('uint64_t [4]')
+        local nb, err = S.read(t.fd, rvals, len or ffi.sizeof(rvals))
+        if not nb then return nil, err end
+        return nb == 8 and rvals[0] or rvals
+      end,
+      -- mmap(2) interface, see sampling interface (`perf_attr.sample_type` and `perf_attr.mmap`)
+      -- first page is metadata page, the others are sample_type dependent
+      mmap = function (t, pages)
+        t:munmap()
+        pages = pages or 8
+        local map, err = S.mmap(nil, (pages + 1) * S.getpagesize(), "read, write", "shared", t.fd, 0)
+        if not map then return nil, err end
+        t.map = map
+        t.map_pages = pages
+        return pages
+      end,
+      meta = function (t)
+        return t.map_pages > 0 and ffi.cast("struct perf_event_mmap_page *", t.map) or nil
+      end,
+      -- next() function for __ipairs returning (len, event) pairs
+      -- it only retires read events when current event length is passed
+      next = function (t, curlen)
+        local buffer_size = S.getpagesize() * t.map_pages
+        local base = t.map + S.getpagesize()
+        local meta = t:meta()
+        -- Retire last read event or start iterating
+        if curlen then
+          meta.data_tail = meta.data_tail + curlen
+        end
+        -- End of ring buffer, yield
+        -- TODO:
+        if meta.data_head == meta.data_tail then
+          return
+        end
+        local e = pt.perf_event_header(base + (meta.data_tail % buffer_size))
+        local e_end = base + (meta.data_tail + e.size) % buffer_size;
+        -- If the perf event wraps around the ring, we need to make a contiguous copy
+        if ffi.cast("uintptr_t", e_end) < ffi.cast("uintptr_t", e) then
+          local tmp_e = ffi.new("char [?]", e.size)
+          local len = (base + buffer_size) - ffi.cast('char *', e)
+          ffi.copy(tmp_e, e, len)
+          ffi.copy(tmp_e + len, base, e.size - len)
+          e = ffi.cast(ffi.typeof(e), tmp_e)
+        end
+        return e.size, e
+      end,
+      -- Various ioctl() wrappers
+      ioctl = function(t, cmd, val) return S.ioctl(t.fd, cmd, val or 0) end,
+      start = 
function(t) return t:ioctl("PERF_EVENT_IOC_ENABLE") end, + stop = function(t) return t:ioctl("PERF_EVENT_IOC_DISABLE") end, + refresh = function(t) return t:ioctl("PERF_EVENT_IOC_REFRESH") end, + reset = function(t) return t:ioctl("PERF_EVENT_IOC_RESET") end, + setfilter = function(t, val) return t:ioctl("PERF_EVENT_IOC_SET_FILTER", val) end, + setbpf = function(t, fd) return t:ioctl("PERF_EVENT_IOC_SET_BPF", pt.void(fd)) end, + }, + __ipairs = function(t) return t.next, t, nil end + }) +end + -- TODO reinstate this, more like fd is, hence changes to destroy --[[ t.aio_context = metatype("struct {aio_context_t ctx;}", { diff --git a/lib/ljsyscall/syscall/netbsd/ffifunctions.lua b/lib/ljsyscall/syscall/netbsd/ffifunctions.lua index 2a6741b425..25b32278ca 100644 --- a/lib/ljsyscall/syscall/netbsd/ffifunctions.lua +++ b/lib/ljsyscall/syscall/netbsd/ffifunctions.lua @@ -78,5 +78,7 @@ int __nanosleep50(const struct timespec *req, struct timespec *rem); int __timer_settime50(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec * old_value); int __timer_gettime50(timer_t timerid, struct itimerspec *curr_value); int __adjtime50(const struct timeval *delta, struct timeval *olddelta); + +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); ]] diff --git a/lib/ljsyscall/syscall/openbsd/ffi.lua b/lib/ljsyscall/syscall/openbsd/ffi.lua index d89b277f27..a53fc2b5b2 100644 --- a/lib/ljsyscall/syscall/openbsd/ffi.lua +++ b/lib/ljsyscall/syscall/openbsd/ffi.lua @@ -295,6 +295,7 @@ struct sigaction { append [[ int reboot(int howto); int ioctl(int d, unsigned long request, void *arg); +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); /* not syscalls, but using for now */ int grantpt(int fildes); diff --git a/lib/ljsyscall/syscall/osx/c.lua b/lib/ljsyscall/syscall/osx/c.lua index 08d6a0339e..82d077f53b 100644 --- a/lib/ljsyscall/syscall/osx/c.lua +++ b/lib/ljsyscall/syscall/osx/c.lua @@ -42,6 +42,7 @@ local C 
= setmetatable({}, { }) -- new stat structure, else get legacy one; could use syscalls instead +-- does not work for fstatat C.stat = C.stat64 C.fstat = C.fstat64 C.lstat = C.lstat64 @@ -56,7 +57,7 @@ function C.getdirentries(fd, buf, len, basep) end ]] --- cannot find these anywhere! +-- cannot find these anywhere! Apparently not there since 64 bit inodes? --C.getdirentries = ffi.C._getdirentries --C.sigaction = ffi.C._sigaction diff --git a/lib/ljsyscall/syscall/osx/constants.lua b/lib/ljsyscall/syscall/osx/constants.lua index 9a6ec0714d..40108a2003 100644 --- a/lib/ljsyscall/syscall/osx/constants.lua +++ b/lib/ljsyscall/syscall/osx/constants.lua @@ -1109,5 +1109,27 @@ c.CLOCKTYPE = { c.CLOCKTYPE.REALTIME = c.CLOCKTYPE.SYSTEM -return c +c.CLOCK = strflag { + REALTIME = 0, + MONOTONIC_RAW = 4, + MONOTONIC_RAW_APPROX = 5, + MONOTONIC = 6, + UPTIME_RAW = 8, + UPTIME_RAW_APPROX = 9, + PROCESS_CPUTIME_ID = 12, + THREAD_CPUTIME_ID = 16, +} + +-- AT constants only in recent versions, should check when added +c.AT_FDCWD = atflag { + FDCWD = -2, +} +c.AT = multiflags { + EACCESS = 0x0010, + SYMLINK_NOFOLLOW = 0x0020, + SYMLINK_FOLLOW = 0x0040, + REMOVEDIR = 0x0080, +} + +return c diff --git a/lib/ljsyscall/syscall/osx/ffi.lua b/lib/ljsyscall/syscall/osx/ffi.lua index ae6fedc1b0..831e5621bf 100644 --- a/lib/ljsyscall/syscall/osx/ffi.lua +++ b/lib/ljsyscall/syscall/osx/ffi.lua @@ -30,7 +30,8 @@ typedef int64_t blkcnt_t; typedef int32_t blksize_t; typedef int32_t suseconds_t; typedef uint16_t nlink_t; -typedef uint64_t ino_t; // at least on recent desktop; TODO define as ino64_t +typedef uint64_t ino64_t; +typedef uint32_t ino_t; typedef long time_t; typedef int32_t daddr_t; typedef unsigned long clock_t; @@ -158,7 +159,7 @@ struct stat { dev_t st_dev; mode_t st_mode; nlink_t st_nlink; - ino_t st_ino; + ino64_t st_ino; uid_t st_uid; gid_t st_gid; dev_t st_rdev; @@ -174,6 +175,25 @@ struct stat { int32_t st_lspare; int64_t st_qspare[2]; }; +struct stat32 { + dev_t st_dev; + 
ino_t st_ino; + mode_t st_mode; + nlink_t st_nlink; + uid_t st_uid; + gid_t st_gid; + dev_t st_rdev; + struct timespec st_atimespec; + struct timespec st_mtimespec; + struct timespec st_ctimespec; + off_t st_size; + blkcnt_t st_blocks; + blksize_t st_blksize; + uint32_t st_flags; + uint32_t st_gen; + int32_t st_lspare; + int64_t st_qspare[2]; +}; union sigval { int sival_int; void *sival_ptr; @@ -292,6 +312,7 @@ int mount(const char *type, const char *dir, int flags, void *data); int stat64(const char *path, struct stat *sb); int lstat64(const char *path, struct stat *sb); int fstat64(int fd, struct stat *sb); +int fstatat(int dirfd, const char *pathname, struct stat32 *buf, int flags); int _getdirentries(int fd, char *buf, int nbytes, long *basep); int _sigaction(int signum, const struct sigaction *act, struct sigaction *oldact); diff --git a/lib/ljsyscall/syscall/osx/syscalls.lua b/lib/ljsyscall/syscall/osx/syscalls.lua index 47d7918db7..67949c1905 100644 --- a/lib/ljsyscall/syscall/osx/syscalls.lua +++ b/lib/ljsyscall/syscall/osx/syscalls.lua @@ -53,6 +53,14 @@ function S.clock_get_time(clock_serv, cur_time) return cur_time end +-- cannot find out how to get new stat type from fstatat +function S.fstatat(fd, path, buf, flags) + if not buf then buf = t.stat32() end + local ret, err = C.fstatat(c.AT_FDCWD[fd], path, buf, c.AT[flags]) + if ret == -1 then return nil, t.error(err or errno()) end + return buf +end + return S end diff --git a/lib/ljsyscall/syscall/osx/types.lua b/lib/ljsyscall/syscall/osx/types.lua index 4b3304cde0..204ad5aecd 100644 --- a/lib/ljsyscall/syscall/osx/types.lua +++ b/lib/ljsyscall/syscall/osx/types.lua @@ -120,6 +120,9 @@ end addtype(types, "stat", "struct stat", mt.stat) +-- for fstatat where we can't seem to get 64 bit version at present +addtype(types, "stat32", "struct stat32", mt.stat) + local signames = {} local duplicates = {LWT = true, IOT = true, CLD = true, POLL = true} for k, v in pairs(c.SIG) do diff --git
a/lib/ljsyscall/syscall/syscalls.lua b/lib/ljsyscall/syscall/syscalls.lua index 33051be18a..c6d3417028 100644 --- a/lib/ljsyscall/syscall/syscalls.lua +++ b/lib/ljsyscall/syscall/syscalls.lua @@ -327,7 +327,9 @@ function S.getsockopt(fd, level, optname, optval, optlen) local ret, err = C.getsockopt(getfd(fd), c.SOL[level], c.SO[optname], optval, len) if ret == -1 then return nil, t.error(err or errno()) end if len[0] ~= optlen then error("incorrect optlen for getsockopt: set " .. optlen .. " got " .. len[0]) end - return optval[0] -- TODO will not work if struct, eg see netfilter + local ok, ret = pcall(function () return optval[0] end) + if ok then return ret + else return optval end end function S.bind(sockfd, addr, addrlen) local saddr = pt.sockaddr(addr) @@ -428,6 +430,7 @@ function S.getpid() return C.getpid() end function S.getppid() return C.getppid() end function S.getgid() return C.getgid() end function S.getegid() return C.getegid() end +function S.gettid() return C.gettid() end function S.setuid(uid) return retbool(C.setuid(uid)) end function S.setgid(gid) return retbool(C.setgid(gid)) end function S.seteuid(uid) return retbool(C.seteuid(uid)) end diff --git a/lib/ljsyscall/test/bsd.lua b/lib/ljsyscall/test/bsd.lua index 70cba01b39..5fcb4a3b74 100644 --- a/lib/ljsyscall/test/bsd.lua +++ b/lib/ljsyscall/test/bsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi @@ -89,7 +89,9 @@ test.filesystem_bsd = { test_chflags = function() local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.chflags(tmpfile, "uf_append")) + local ok, err = S.chflags(tmpfile, "uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -102,7 +104,9 @@ test.filesystem_bsd = { if not S.lchflags then error 
"skipped" end local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.lchflags(tmpfile, "uf_append")) + local ok, err = S.lchflags(tmpfile, "uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -114,7 +118,9 @@ test.filesystem_bsd = { test_fchflags = function() local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(fd:chflags("uf_append")) + local ok, err = fd:chflags("uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -127,7 +133,9 @@ test.filesystem_bsd = { if not S.chflagsat then error "skipped" end local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.chflagsat("fdcwd", tmpfile, "uf_append", "symlink_nofollow")) + local ok, err = S.chflagsat("fdcwd", tmpfile, "uf_append", "symlink_nofollow") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -258,7 +266,8 @@ test.bsd_extattr = { assert(S.unlink(tmpfile)) local n, err = fd:extattr_get("user", "myattr", false) -- false does raw call with no buffer to return length if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support extattr - assert(not n and err.NOATTR) + assert(not n, "expected to fail") + assert(err.NOATTR, err) assert(fd:close()) end, test_extattr_getsetdel_fd = function() @@ -267,7 +276,8 @@ test.bsd_extattr = { assert(S.unlink(tmpfile)) local n, err = fd:extattr_get("user", "myattr", false) -- false does raw call with no buffer to return length if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support extattr - assert(not n and err.NOATTR) + assert(not n, "expected to fail") + assert(err.NOATTR, err) local n, err = 
fd:extattr_set("user", "myattr", "myvalue") if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support setting extattr assert(n, err) diff --git a/lib/ljsyscall/test/ctest-linux.lua b/lib/ljsyscall/test/ctest-linux.lua index 4433013fcd..a531542231 100644 --- a/lib/ljsyscall/test/ctest-linux.lua +++ b/lib/ljsyscall/test/ctest-linux.lua @@ -66,6 +66,81 @@ ctypes["struct termios"] = nil -- not defined by glibc ctypes["struct k_sigaction"] = nil +-- eBPF not available on Travis / opaque types +ctypes["struct bpf_insn"] = nil +ctypes["union bpf_attr"] = nil +c.BPF_MAP = {} +c.BPF_CMD = {} +c.BPF_PROG = {} +c.BPF_ATTACH_TYPE = {} +c.BPF.ALU64 = nil +c.BPF.DW = nil +c.BPF.JSGT = nil +c.BPF.JSGE = nil +c.BPF.CALL = nil +c.BPF.EXIT = nil +c.BPF.TO_LE = nil +c.BPF.TO_BE = nil +c.BPF.ANY = nil +c.BPF.NOEXIST = nil +c.BPF.EXIST = nil +c.BPF.END = nil +c.BPF.ARSH = nil +c.BPF.XADD = nil +c.BPF.JNE = nil +c.BPF.MOV = nil +c.SYS.bpf = nil + +-- no perf_event_open on Travis CI +ctypes["struct perf_event_attr"] = nil +ctypes["struct perf_event_reader"] = nil +ctypes["struct perf_event_header"] = nil +ctypes["struct perf_event_mmap_page"] = nil +c.PERF_TYPE = {} +c.PERF_COUNT = {} +c.PERF_SAMPLE = {} +c.PERF_FLAG = {} +c.PERF_SAMPLE_REGS = {} +c.PERF_SAMPLE_BRANCH = {} +c.PERF_READ_FORMAT = {} +c.PERF_RECORD = {} +-- no perf_event_open ioctls on Travis CI +c.IOCTL.PERF_EVENT_IOC_ENABLE = nil +c.IOCTL.PERF_EVENT_IOC_DISABLE = nil +c.IOCTL.PERF_EVENT_IOC_REFRESH = nil +c.IOCTL.PERF_EVENT_IOC_RESET = nil +c.IOCTL.PERF_EVENT_IOC_PERIOD = nil +c.IOCTL.PERF_EVENT_IOC_SET_OUTPUT = nil +c.IOCTL.PERF_EVENT_IOC_SET_FILTER = nil +c.IOCTL.PERF_EVENT_IOC_ID = nil +c.IOCTL.PERF_EVENT_IOC_SET_BPF = nil + +-- not in kernel headers used by Travis CI +ctypes["struct scm_timestamping"] = nil +c.SCM.TSTAMP_ACK = nil +c.SCM.TSTAMP_SCHED = nil +c.SCM.TSTAMP_SND = nil +c.SCM.TIMESTAMPING_OPT_STATS = nil + +-- not in kernel headers used by Travis CI +c.SOF.TIMESTAMPING_LAST = nil 
+c.SOF.TIMESTAMPING_MASK = nil +c.SOF.TIMESTAMPING_OPT_CMSG = nil +c.SOF.TIMESTAMPING_OPT_ID = nil +c.SOF.TIMESTAMPING_OPT_PKTINFO = nil +c.SOF.TIMESTAMPING_OPT_STATS = nil +c.SOF.TIMESTAMPING_OPT_TSONLY = nil +c.SOF.TIMESTAMPING_OPT_TX_SWHW = nil +c.SOF.TIMESTAMPING_RAW_HARDWARE = nil +c.SOF.TIMESTAMPING_RX_HARDWARE = nil +c.SOF.TIMESTAMPING_RX_SOFTWARE = nil +c.SOF.TIMESTAMPING_SOFTWARE = nil +c.SOF.TIMESTAMPING_SYS_HARDWARE = nil +c.SOF.TIMESTAMPING_TX_ACK = nil +c.SOF.TIMESTAMPING_TX_HARDWARE = nil +c.SOF.TIMESTAMPING_TX_SCHED = nil +c.SOF.TIMESTAMPING_TX_SOFTWARE = nil + if abi.arch == "arm" then ctypes["struct statfs64"] = nil end -- padding difference, not that important for k, v in pairs(c.IOCTL) do if type(v) == "table" then c.IOCTL[k] = v.number end end @@ -223,6 +298,10 @@ c.TCP.QUEUE_SEQ = nil c.TCP.TIMESTAMP = nil c.TCP.USER_TIMEOUT = nil c.TCP.REPAIR_QUEUE = nil +c.RTA.NEWDST = nil +c.RTA.PREF = nil +c.RTA.VIA = nil +c.RTA.MFC_STATS = nil -- these are not in Musl at present TODO send patches to get them in c.IPPROTO.UDPLITE = nil @@ -279,6 +358,23 @@ c.SO.PEEK_OFF = nil c.SO.GET_FILTER = nil c.SO.NOFCS = nil c.SO.WIFI_STATUS = nil +c.SO.REUSEPORT = nil +c.SO.LOCK_FILTER = nil +c.SO.SELECT_ERR_QUEUE = nil +c.SO.BUSY_POLL = nil +c.SO.MAX_PACING_RATE = nil +c.SO.BPF_EXTENSIONS = nil +c.SO.INCOMING_CPU = nil +c.SO.ATTACH_BPF = nil +c.SO.DETACH_BPF = nil +c.SO.ATTACH_REUSEPORT_CBPF = nil +c.SO.ATTACH_REUSEPORT_EBPF = nil + +-- new fcntl +c.F.CANCELLK = nil +c.F.ADD_SEALS = nil +c.F.GET_SEALS = nil +c.F_SEAL = nil -- Musl changes some of the syscall constants in its 32/64 bit handling c.SYS.getdents = nil @@ -318,6 +414,8 @@ c.CBAUDEX = nil -- missing on my mips box c.AUDIT_ARCH.H8300 = nil +-- missing on CI +c.AUDIT_ARCH.AARCH64 = nil -- defined only in linux/termios.h which we cannot include on mips c.TIOCM.OUT1 = nil diff --git a/lib/ljsyscall/test/freebsd.lua b/lib/ljsyscall/test/freebsd.lua index 4a57af07ad..e296017f4b 100644 --- 
a/lib/ljsyscall/test/freebsd.lua +++ b/lib/ljsyscall/test/freebsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/helpers.lua b/lib/ljsyscall/test/helpers.lua new file mode 100644 index 0000000000..6f29c0a1d3 --- /dev/null +++ b/lib/ljsyscall/test/helpers.lua @@ -0,0 +1,81 @@ +-- misc helper functions + +local require, error, assert, tonumber, tostring, +setmetatable, pairs, ipairs, unpack, rawget, rawset, +pcall, type, table, string, math = +require, error, assert, tonumber, tostring, +setmetatable, pairs, ipairs, unpack, rawget, rawset, +pcall, type, table, string, math + +local debug, collectgarbage = require "debug", collectgarbage + +local ffi = require "ffi" +local bit = require "bit" + +local h = {} + +-- generic assert helper, mainly for tests +function h.assert(cond, err, ...) + if not cond then + error(tostring(err or "unspecified error")) -- annoyingly, assert does not call tostring! + end + collectgarbage("collect") -- force gc, to test for bugs + if type(cond) == "function" then return cond, err, ... end + if cond == true then return ... end + return cond, ... 
+end + +-- endian conversion +if ffi.abi("be") then -- nothing to do + function h.htonl(b) return b end + function h.htons(b) return b end + function h.convle32(b) return bit.bswap(b) end -- used by file system capabilities, always stored as le +else + function h.htonl(b) return bit.bswap(b) end + function h.htons(b) return bit.rshift(bit.bswap(b), 16) end + function h.convle32(b) return b end -- used by file system capabilities, always stored as le +end +h.ntohl = h.htonl -- reverse is the same +h.ntohs = h.htons -- reverse is the same + +function h.octal(s) return tonumber(s, 8) end +local octal = h.octal + +function h.split(delimiter, text) + if delimiter == "" then return {text} end + if #text == 0 then return {} end + local list = {} + local pos = 1 + while true do + local first, last = text:find(delimiter, pos) + if first then + list[#list + 1] = text:sub(pos, first - 1) + pos = last + 1 + else + list[#list + 1] = text:sub(pos) + break + end + end + return list +end + +function h.trim(s) -- TODO should replace underscore with space + return (s:gsub("^%s*(.-)%s*$", "%1")) +end + +local split, trim = h.split, h.trim + +h.divmod = function(a, b) + return math.floor(a / b), a % b +end + +h.booltoc = setmetatable({ + [0] = 0, + [1] = 1, + [false] = 0, + [true] = 1, +}, {__call = function(tb, arg) return tb[arg or 0] end}) -- allow nil as false + +function h.ctobool(i) return tonumber(i) ~= 0 end + +return h diff --git a/lib/ljsyscall/test/linux-constants.lua b/lib/ljsyscall/test/linux-constants.lua index cb2de6e968..b25947f639 100644 --- a/lib/ljsyscall/test/linux-constants.lua +++ b/lib/ljsyscall/test/linux-constants.lua @@ -155,6 +155,21 @@ local function fixup_constants(abi, c) c.SECCOMP_MODE = nil c.SECCOMP_RET = nil c.MFD = nil + c.RTA.NEWDST = nil + c.RTA.PREF = nil + c.RTA.VIA = nil + c.RTA.MFC_STATS = nil + c.AUDIT_ARCH.AARCH64 = nil + c.SO.MAX_PACING_RATE = nil + c.SO.BPF_EXTENSIONS = nil + c.SO.INCOMING_CPU = nil + c.SO.ATTACH_BPF = nil + c.SO.DETACH_BPF 
= nil + c.SO.ATTACH_REUSEPORT_CBPF = nil + c.SO.ATTACH_REUSEPORT_EBPF = nil + c.F_SEAL = nil + c.F.ADD_SEALS = nil + c.F.GET_SEALS = nil -- these are not even in linux git head headers or names wrong c.O.ASYNC = nil @@ -202,9 +217,58 @@ local function fixup_constants(abi, c) c.SYS.getrandom = nil c.SYS.memfd_create = nil c.SYS.kexec_file_load = nil + c.SYS.bpf = nil -- new constants c.GRND = nil + -- requires Linux 3.19+, not supported on Travis + c.BPF_MAP = {} + c.BPF_CMD = {} + c.BPF_PROG = {} + c.BPF_ATTACH_TYPE = {} + c.BPF.ALU64 = nil + c.BPF.DW = nil + c.BPF.JSGT = nil + c.BPF.JSGE = nil + c.BPF.CALL = nil + c.BPF.EXIT = nil + c.BPF.TO_LE = nil + c.BPF.TO_BE = nil + c.BPF.END = nil + c.BPF.ARSH = nil + c.BPF.XADD = nil + c.BPF.JNE = nil + c.BPF.MOV = nil + c.BPF.ANY = nil + c.BPF.EXIST = nil + c.BPF.NOEXIST = nil + -- no perf_event_open on Travis CI + c.PERF_TYPE = {} + c.PERF_COUNT = {} + c.PERF_SAMPLE = {} + c.PERF_FLAG = {} + c.PERF_SAMPLE_REGS = {} + c.PERF_SAMPLE_BRANCH = {} + c.PERF_READ_FORMAT = {} + c.PERF_RECORD = {} + + c.SOF.TIMESTAMPING_LAST = nil + c.SOF.TIMESTAMPING_MASK = nil + c.SOF.TIMESTAMPING_OPT_CMSG = nil + c.SOF.TIMESTAMPING_OPT_ID = nil + c.SOF.TIMESTAMPING_OPT_PKTINFO = nil + c.SOF.TIMESTAMPING_OPT_STATS = nil + c.SOF.TIMESTAMPING_OPT_TSONLY = nil + c.SOF.TIMESTAMPING_OPT_TX_SWHW = nil + c.SOF.TIMESTAMPING_RAW_HARDWARE = nil + c.SOF.TIMESTAMPING_RX_HARDWARE = nil + c.SOF.TIMESTAMPING_RX_SOFTWARE = nil + c.SOF.TIMESTAMPING_SOFTWARE = nil + c.SOF.TIMESTAMPING_SYS_HARDWARE = nil + c.SOF.TIMESTAMPING_TX_ACK = nil + c.SOF.TIMESTAMPING_TX_HARDWARE = nil + c.SOF.TIMESTAMPING_TX_SCHED = nil + c.SOF.TIMESTAMPING_TX_SOFTWARE = nil return c end diff --git a/lib/ljsyscall/test/linux-structures.lua b/lib/ljsyscall/test/linux-structures.lua index 811d723dcb..0ab4a68a6c 100644 --- a/lib/ljsyscall/test/linux-structures.lua +++ b/lib/ljsyscall/test/linux-structures.lua @@ -28,6 +28,7 @@ local function fixup_structs(abi, ctypes) ctypes["struct 
capabilities"] = nil ctypes["struct cap"] = nil ctypes["struct {dev_t dev;}"] = nil + ctypes["struct perf_event_reader"] = nil -- standard headers use __kernel types for these or just fixed sizes ctypes.ino_t = nil @@ -68,7 +69,12 @@ local function fixup_structs(abi, ctypes) ctypes["struct sockaddr_storage"] = nil -- uses __kernel_ ctypes["struct k_sigaction"] = nil -- seems to be incorrect in headers ctypes["struct mmsghdr"] = nil -- too new for our headers - + ctypes["union bpf_attr"] = nil -- too new for our headers + ctypes["struct bpf_insn"] = nil -- too new for our headers + ctypes["struct perf_event_attr"] = nil -- too new for our headers + ctypes["struct perf_event_header"] = nil -- too new for our headers + ctypes["struct perf_event_mmap_page"] = nil -- too new for our headers + ctypes["struct scm_timestamping"] = nil -- too new for our headers ctypes["sigset_t"] = nil -- still some issues return ctypes diff --git a/lib/ljsyscall/test/linux.lua b/lib/ljsyscall/test/linux.lua index 40e4d48506..953e5ea859 100644 --- a/lib/ljsyscall/test/linux.lua +++ b/lib/ljsyscall/test/linux.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local abi = S.abi local types = S.types local c = S.c @@ -269,7 +269,9 @@ test.misc_linux = { end, ]] test_adjtimex = function() - local tt = assert(S.adjtimex()) + local tt, err = S.adjtimex() + if not tt and err.PERM then error "skipped" end + assert(tt, err) end, test_prctl = function() local n @@ -346,9 +348,12 @@ test.misc_linux = { end, test_memfd = function() if not S.memfd_create then error "skipped" end - local fd, err = S.memfd_create("", "cloexec") + local fd, err = S.memfd_create("", "cloexec, allow_sealing") if not fd and err.NOSYS then error "skipped" end assert(fd, err) + local seals = assert(fd:fcntl("get_seals")) + assert(seals == 0) + assert(fd:fcntl("add_seals", "shrink, grow, write, seal")) assert(fd:close()) end, } @@ -431,10 +436,6 @@ 
test.netlink = { end, test_getlink = function() local i = assert(nl.getlink()) - local st, err = S.stat("/sys/class/net") -- just in case sysfs not mounted - if not st then error "skipped" end - local df = assert(util.dirtable("/sys/class/net", true)) - assert_equal(#df, #i, "expect same number of interfaces as /sys/class/net") assert(i.lo, "expect a loopback interface") local lo = i.lo assert(lo.flags.up, "loopback interface should be up") @@ -486,7 +487,9 @@ test.netlink = { test_interfaces = function() local i = assert(nl.interfaces()) assert_equal(tostring(i.lo.inet[1].addr), "127.0.0.1", "loopback ipv4 on lo") - assert_equal(tostring(i.lo.inet6[1].addr), "::1", "loopback ipv6 on lo") + if i.lo.inet6[1] then + assert_equal(tostring(i.lo.inet6[1].addr), "::1", "loopback ipv6 on lo") + end end, test_newlink_flags_root = function() local p = assert(S.clone()) @@ -566,12 +569,6 @@ test.netlink = { assert(i.dummy0:down()) assert(i.dummy0:delete()) end, - test_interface_set_macaddr_fail = function() - local i = assert(nl.interfaces()) - assert(i.lo, "expect to find lo") - local ok, err = nl.newlink(i.lo.index, 0, 0, 0, "address", "46:9d:c9:06:dd:dd") - assert(not ok and err and (err.PERM or err.OPNOTSUPP), "should not be able to change macaddr on lo") - end, test_newlink_error_root = function() local ok, err = nl.newlink(-1, 0, "up", "up") assert(not ok, "expect bogus newlink to fail") @@ -670,6 +667,7 @@ test.netlink = { test_getroute_inet6 = function() local r = assert(nl.routes("inet6", "unspec")) local nr = r:match("::1/128") + if #nr == 0 then error "skipped" end -- no ipv6 support assert(#nr >= 1, "expect at least one matched route") -- one of my machines has two local lor = nr[1] assert_equal(tostring(lor.source), "::", "expect empty source route") @@ -766,6 +764,10 @@ test.netlink = { assert_equal(#n, 1) assert_equal(tostring(n[1].lladdr), "46:9d:c9:06:dd:dd") assert_equal(tostring(n[1].dst), "10.0.0.2") + assert_equal(tostring(n[1].dest), "10.0.0.2") + 
assert_equal(n[1].ifindex, i.dummy0.index) + assert_equal(n[1].state, c.NUD.PERMANENT) + assert_equal(n[1].flags, 0) assert(nl.delneigh(i.dummy0, {family = "inet"}, "dst", "10.0.0.2", "lladdr", "46:9d:c9:06:dd:dd")) assert(i.dummy0:delete()) end, @@ -1315,9 +1317,152 @@ test.bpf = { end, } +-- test eBPF filters +if S.bpf and not S.__rump then + test.bpf_root = {} + test.bpf_root.test_bpf_map_create = function() + local bpf = t.sock_filters(1, { + t.sock_filter("RET,K", 0) + }) + -- Update + local key, klen = ffi.new('int [1]', 0xdead), ffi.sizeof('int') + local fd, err = assert(S.bpf_map_create(c.BPF_MAP.HASH, klen, klen, 10)) + assert(S.bpf_map_op(c.BPF_CMD.MAP_UPDATE_ELEM, fd, key, key) == 0) + -- Retrieve + local val = ffi.new('int [1]', 0xbeef) + local ok, err = S.bpf_map_op(c.BPF_CMD.MAP_LOOKUP_ELEM, fd, key, val) + assert(ok and key[0] == val[0]) + S.close(fd) + end + test.bpf_root.test_bpf_prog_load = function() + local bpf = t.bpf_insns(2, { + t.bpf_insn("ALU64,MOV,K", 0, 0, 0, 1), + t.bpf_insn("JMP,EXIT"), + }) + local fd, err, log = S.bpf_prog_load(c.BPF_PROG.SOCKET_FILTER, bpf, 2) + if not fd then assert(false, err..': '..log) end + S.close(fd) + end +end + +-- test perf_event_open +if S.perf_event_open and not S.__rump then + test.perf_root = {} + test.perf_root.test_perf_open = function () + -- Create perf event attribute with dummy config + local pe = t.perf_event_attr1() + pe[0].type = "software" + pe[0].config = "sw_dummy" + pe[0].disabled = 1 + pe[0].exclude_kernel = 1 + pe[0].exclude_hv = 1 + -- Open event and read a dummy value + local fd = S.perf_event_open(pe) + fd:ioctl("PERF_EVENT_IOC_ENABLE", 0) + local count = t.buffer(ffi.sizeof('int64_t')) + local rb = fd:read(count, ffi.sizeof(count)) + fd:ioctl("PERF_EVENT_IOC_DISABLE", 0) + fd:close() + -- Check just the size of read count + assert(rb == ffi.sizeof(count)) + end + test.perf_root.test_perf_sw = function () + -- Read out a software perf counter + local pe = t.perf_event_attr1() + 
pe[0].type = "software" + pe[0].config = "sw_cpu_clock" + pe[0].exclude_kernel = 1 + pe[0].exclude_hv = 1 + -- Open event and read a dummy value + -- @note perf event fd has CLOEXEC, must not fork + local reader = t.perf_reader(S.perf_event_open(pe)) + reader:start() + local ticks = reader:read() + reader:close() + -- Check that the counter has advanced + assert(ticks > 0) + end + test.perf_root.test_perf_attach = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + -- Get tracepoint id + local tp = assert(S.perf_tracepoint("/sys/kernel/debug/tracing/events/syscalls/sys_enter_getcwd")) + local reader = S.perf_attach_tracepoint(tp) + -- Trace getcwd() syscall + reader:start() + S.getcwd() + S.getcwd() + local cnt = reader:read() + reader:stop() + reader:close() + -- Check value + assert(cnt == 2) + end + test.perf_root.test_perf_sampling = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + local sample_t = ffi.typeof [[ + struct { + struct perf_event_header header; + uint32_t size; + struct { + uint16_t id; + uint8_t flags; + uint8_t preempt_count; + int pid; + }; + uint64_t ip; + } * + ]] + -- Get tracepoint id + local tp = assert(S.perf_tracepoint("/sys/kernel/debug/tracing/events/syscalls/sys_enter_getcwd")) + local reader = S.perf_attach_tracepoint(tp) + -- Trace getcwd() syscall + reader:mmap() + reader:start() + for i = 1,10 do S.getcwd() end + reader:stop() + -- Read samples from mmap + local cnt = 0; + for len,e in ipairs(reader) do + if e.type ~= c.PERF_RECORD.SAMPLE then break end + -- Check if we're the caller + e = ffi.cast(sample_t, e) + if e.pid == S.getpid() then + cnt = cnt + 1 + end + end + reader:close() + -- Check if we got all samples + assert(cnt == 10) + end + test.perf_root.test_perf_kprobe = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping')
-- debugfs must be mounted + return + end + -- Attach a kprobe to open() + local tp = assert(S.perf_probe("kprobe", "myprobe", "do_sys_open $retval", true)) + local reader = S.perf_attach_tracepoint(tp) + reader:start() + S.open("/tmp", "rdonly") + local cnt = reader:read() + reader:stop() + reader:close() + -- Detach probe + S.perf_probe("kprobe", "myprobe", false) + -- See if we hit the probe + assert(cnt == 1) + end +end + -- TODO remove arch tests. Unclear if my ppc/arm does not support or a bug, retest later with newer kernel -- still ppc issues with 3.12.6 ppc, need to debug more, and mips issues -if not (abi.arch == "ppc64le" or abi.arch == "ppc" or abi.arch == "arm" or abi.arch == "mips" or S.__rump) then -- cannot test on rump as uses clone() +if not (abi.arch == "ppc64le" or abi.arch == "ppc" or abi.arch == "mips" or S.__rump) then -- cannot test on rump as uses clone() test.seccomp = { test_no_new_privs = function() -- this must be done for non root to call type 2 seccomp local p = assert(S.clone()) @@ -1481,7 +1626,7 @@ test.seccomp = { local pid = S.getpid() local ofd, err = S.open("/dev/null", "rdonly") -- not allowed fork_assert(not ofd, "should not run open") - fork_assert(err.errno == nr.SYS.open, "syscall that did not work should be open") + fork_assert(err.errno == nr.SYS.open or err.errno == nr.SYS.openat, "syscall that did not work should be open[at]") local pid = S.getpid() S._exit() else @@ -1764,7 +1909,7 @@ test.processes_linux = { fork_assert(S.getppid() == pid0, "parent pid should be previous pid") S.exit(23) else -- parent - local infop, rusage = assert(S.waitid("all", 0, "exited, stopped, continued")) + local infop, rusage = assert(S.waitid("pid", pid, "exited, stopped, continued")) assert_equal(infop.signo, c.SIG.CHLD, "waitid to return SIGCHLD") assert_equal(infop.status, 23, "exit should be 23") assert_equal(infop.code, c.SIGCLD.EXITED, "normal exit expected") @@ -1784,6 +1929,9 @@ test.processes_linux = { assert(status.EXITSTATUS == 
23, "exit should be 23") end end, + test_tid = function() + assert(S.getpid() == S.gettid(), "PID should be the same as TID") + end, } test.scheduler = { test_getcpu = function() @@ -1850,13 +1998,11 @@ test.swap = { assert_equal(c.SWAP_FLAG["23, discard"], c.SWAP_FLAG["prefer, discard"] + bit.lshift(23, c.SWAP_FLAG["prio_shift"])) end, test_swap_fail = function() - local ex = "PERM" -- EPERM if not root - if S.geteuid() == 0 then ex = "INVAL" end local ok, err = S.swapon("/dev/null", "23, discard") if not ok and err.NOSYS then return end -- Android does not implement swap, so skip test - assert(not ok and err[ex], "should not create swap on /dev/null") + assert(not ok and (err.PERM or err.INVAL), "should not create swap on /dev/null") local ok, err = S.swapoff("/dev/null") - assert(not ok and err[ex], "no swap on /dev/null") + assert(not ok and (err.PERM or err.INVAL), "no swap on /dev/null") end, -- TODO need mkswap to test success } diff --git a/lib/ljsyscall/include/luaunit/luaunit.lua b/lib/ljsyscall/test/luaunit.lua similarity index 100% rename from lib/ljsyscall/include/luaunit/luaunit.lua rename to lib/ljsyscall/test/luaunit.lua diff --git a/lib/ljsyscall/test/netbsd.lua b/lib/ljsyscall/test/netbsd.lua index 9570d393d4..3039ed49f9 100644 --- a/lib/ljsyscall/test/netbsd.lua +++ b/lib/ljsyscall/test/netbsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/openbsd.lua b/lib/ljsyscall/test/openbsd.lua index ce2b4a7125..755c32094a 100644 --- a/lib/ljsyscall/test/openbsd.lua +++ b/lib/ljsyscall/test/openbsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/osx.lua b/lib/ljsyscall/test/osx.lua index 75ff6b932c..5346ccdbce 100644 --- 
a/lib/ljsyscall/test/osx.lua +++ b/lib/ljsyscall/test/osx.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/test/rump.lua b/lib/ljsyscall/test/rump.lua index d46a47cb3a..afd161209d 100644 --- a/lib/ljsyscall/test/rump.lua +++ b/lib/ljsyscall/test/rump.lua @@ -3,7 +3,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/lib/ljsyscall/include/strict/strict.lua b/lib/ljsyscall/test/strict.lua similarity index 100% rename from lib/ljsyscall/include/strict/strict.lua rename to lib/ljsyscall/test/strict.lua diff --git a/lib/ljsyscall/test/test.lua b/lib/ljsyscall/test/test.lua index 64c875dc70..fddee0a6be 100644 --- a/lib/ljsyscall/test/test.lua +++ b/lib/ljsyscall/test/test.lua @@ -4,12 +4,9 @@ arg = arg or {} --- only use this installation for tests -package.path = "./?.lua;" +local strict = require "test.strict" -local strict = require "include.strict.strict" - -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local assert = helpers.assert @@ -108,7 +105,7 @@ local function assert_equal(...) 
end USE_EXPECTED_ACTUAL_IN_ASSERT_EQUALS = true -- strict wants this to be set -local luaunit = require "include.luaunit.luaunit" +local luaunit = require "test.luaunit" local sysfile = debug.getinfo(S.open).source local cov = {active = {}, cov = {}} @@ -953,7 +950,7 @@ test_file_operations_at = { local fd = assert(S.open(".")) assert(util.writefile(tmpfile, teststring, "RWXU")) local stat = assert(fd:fstatat(tmpfile)) - assert(stat.size == #teststring, "expect length to br what was written") + assert(stat.size == #teststring, "expect length to be what was written") assert(fd:close()) assert(S.unlink(tmpfile)) end, @@ -1356,7 +1353,9 @@ test_sockets_pipes = { assert(ss:nonblock()) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1410,6 +1409,7 @@ test_sockets_pipes = { local ok, err = cs:connect(ba6) local as = ss:accept() local ok, err = cs:connect(ba6) + if err.ADDRNOTAVAIL or err.NETUNREACH then error "skipped" end assert(ok or err.ISCONN, "unexpected error " .. 
tostring(err)); assert(ss:block()) -- force accept to wait as = as or assert(ss:accept()) @@ -1455,7 +1455,9 @@ test_sockets_pipes = { assert(ss:setsockopt(c.IPPROTO.IPV6, c.IPV6.V6ONLY, 1)) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1497,7 +1499,9 @@ test_sockets_pipes = { assert(ss:setsockopt(c.IPPROTO.IPV6, c.IPV6.V6ONLY, 1)) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1534,7 +1538,9 @@ test_sockets_pipes = { local loop6 = "::1" local cs = assert(S.socket("inet6", "dgram")) local sa = assert(t.sockaddr_in6(0, loop6)) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local bsa = ss:getsockname() -- find bound address local n = assert(cs:sendto(teststring, nil, c.MSG.NOSIGNAL or 0, bsa)) -- got a sigpipe here on MIPS local f = assert(ss:recv(buf, size)) @@ -1645,7 +1651,9 @@ test_sockets_pipes = { assert(s, err) local s = assert(S.socket("inet6", "stream")) local sa = t.sockaddr_in6(0, "loopback") - assert(s:bind(sa)) + ok, err = s:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) assert_equal(s:getsockopt("socket", "keepalive"), 0) assert(s:setsockopt("socket", "keepalive", 1)) assert(s:getsockopt("socket", "keepalive") ~= 0) @@ -1668,7 +1676,9 @@ test_sockets_pipes = { assert(s, err) local s = assert(S.socket("inet6", "stream")) local sa = t.sockaddr_in6(0, "loopback") - 
assert(s:bind(sa)) + ok, err = s:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) assert_equal(s:getsockopt(c.IPPROTO.TCP, c.TCP.NODELAY), 0) assert(s:setsockopt(c.IPPROTO.TCP, c.TCP.NODELAY, 1)) assert(s:getsockopt(c.IPPROTO.TCP, c.TCP.NODELAY) ~= 0) @@ -1881,7 +1891,8 @@ test_termios = { local ws, err = S.stdout:ioctl("TIOCGWINSZ") if not ws and err.NOTTY then error "skipped" end -- stdout might not be a tty in test env assert(ws, err) - assert(ws.row > 0 and ws.col > 0) + if ws.row == 0 and ws.col == 0 then error "skipped" end + assert(ws.row > 0 and ws.col > 0, "expect positive winsz") end, } @@ -1946,9 +1957,6 @@ test_raw_socket = { assert(cs == expected, "expect correct ip checksum: got " .. string.format("%%%04X", cs) .. " expected " .. string.format("%%%04X", expected)) end, test_raw_udp_root = function() -- TODO create some helper functions, this is not very nice - - local h = require "syscall.helpers" -- TODO should not have to use later - local loop = "127.0.0.1" local raw = assert(S.socket("inet", "raw", "raw")) -- needed if not on Linux @@ -1972,8 +1980,8 @@ test_raw_socket = { local ca = cl:getsockname() -- TODO iphdr should have __index helpers for endianness etc (note use raw s_addr) - iphdr[0] = {ihl = 5, version = 4, tos = 0, id = 0, frag_off = h.htons(0x4000), ttl = 64, protocol = c.IPPROTO.UDP, check = 0, - saddr = sa.sin_addr.s_addr, daddr = ca.sin_addr.s_addr, tot_len = h.htons(len)} + iphdr[0] = {ihl = 5, version = 4, tos = 0, id = 0, frag_off = helpers.htons(0x4000), ttl = 64, protocol = c.IPPROTO.UDP, check = 0, + saddr = sa.sin_addr.s_addr, daddr = ca.sin_addr.s_addr, tot_len = helpers.htons(len)} --udphdr[0] = {src = sport, dst = ca.port, length = udplen} -- doesnt work with metamethods udphdr[0].src = sport @@ -2215,9 +2223,6 @@ test_proc = { local found = false if #ps == 0 then error "skipped" end -- not mounted but mount point exists for i = 1, #ps do - if ps[i].pid == 1 then - 
assert(ps[i].cmdline:find("init") or ps[i].cmdline:find("systemd"), "expect init or systemd to be process 1 usually") - end if ps[i].pid == me then found = true end end assert(found, "expect to find my process in ps") @@ -2234,7 +2239,6 @@ test_proc = { local p = util.proc(1) if not p.cmdline then error "skipped" end -- no files found, /proc not mounted assert(p and p.cmdline, "expect init to have cmdline") - assert(p.cmdline:find("init") or p.cmdline:find("systemd"), "expect init or systemd to be process 1 usually") end, } @@ -2305,15 +2309,13 @@ test_mmap = { test_processes = { test_nice = function() local n = assert(S.getpriority("process")) - assert_equal(n, 0, "process should start at priority 0") - local nn = assert(S.nice(1)) - assert_equal(nn, 1) - local nn = assert(S.setpriority("process", 0, 1)) -- sets to 1, which it already is + --assert_equal(n, 0, "process should start at priority 0") + --local nn = assert(S.nice(1)) + --assert_equal(nn, 1) + --local nn = assert(S.setpriority("process", 0, n)) -- sets to 1, which it already is end, test_fork_wait = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2327,8 +2329,6 @@ test_processes = { end, test_fork_waitpid = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2357,8 +2357,6 @@ test_processes = { end, test_fork_wait4 = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 
0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2373,8 +2371,6 @@ test_processes = { end, test_fork_wait3 = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2492,7 +2488,8 @@ if S.geteuid() == 0 then local i = assert(nl.interfaces()) local lo = assert(i.lo) assert(lo:up()) - assert(S.mount("none", "/sys", "sysfs")) + -- Do not destroy "/sys" if it is mounted + assert(S.statfs("/sys/kernel") or S.mount("none", "/sys", "sysfs")) end else -- not Linux -- run all tests, no namespaces available diff --git a/lib/pflua/src/pf/types.lua b/lib/pflua/src/pf/types.lua index 4ad1758864..5a727b14f8 100644 --- a/lib/pflua/src/pf/types.lua +++ b/lib/pflua/src/pf/types.lua @@ -38,10 +38,6 @@ struct pcap_pkthdr { -- with the high-bit set as negative int32_t values, so we do the same -- for all of our 32-bit values including the "k" field in BPF -- instructions. 
-ffi.cdef[[ -struct bpf_insn { uint16_t code; uint8_t jt, jf; int32_t k; }; -struct bpf_program { uint32_t bf_len; struct bpf_insn *bf_insns; }; -]] local bpf_program_mt = { __len = function (program) return program.bf_len end, __index = function (program, idx) @@ -50,8 +46,8 @@ local bpf_program_mt = { end } -bpf_insn = ffi.typeof("struct bpf_insn") -bpf_program = ffi.metatype("struct bpf_program", bpf_program_mt) +bpf_insn = ffi.typeof("struct { uint16_t code; uint8_t jt, jf; int32_t k; }") +bpf_program = ffi.metatype("struct { uint32_t bf_len; struct bpf_insn *bf_insns; }", bpf_program_mt) pcap_record = ffi.typeof("struct pcap_record") pcap_pkthdr = ffi.typeof("struct pcap_pkthdr") diff --git a/src/README.md b/src/README.md index 381c0e3410..c5f94415f8 100644 --- a/src/README.md +++ b/src/README.md @@ -422,10 +422,21 @@ Allocate packet and fill it with *length* bytes from *pointer*. Allocate packet and fill it with the contents of *string*. -— Function **packet.clone_to_memory* *pointer* *packet* +— Function **packet.account_free** *packet* + +Increment internal engine statistics (*frees*, *freebytes*, *freebits*) as if +*packet* were freed, but do not actually put it back onto the freelist. + +This function is intended to be used by I/O apps in special cases that need +more finegrained control over packet freeing. + +— Function **packet.free_internal** *packet* + +Free *packet* and put it back onto the freelist, but do not increment internal +engine statistics (*frees*, *freebytes*, *freebits*). + +See **packet.account_free**, **packet.free**. -Creates an exact copy of at memory pointed to by *pointer*. *Pointer* must -point to a `packet.packet_t`. — Function **packet.physical_bits* *packet* diff --git a/src/apps/test/README.md b/src/apps/test/README.md index 6d4ef5bb31..28dda5b634 100644 --- a/src/apps/test/README.md +++ b/src/apps/test/README.md @@ -79,6 +79,11 @@ Generate a random payload for each packet in `sizes`. 
Insert the packet number (32bit uint) directly after the ethertype. The packet number starts at 0 and is sequential on each output link. +— Key **packets** + +Emit *packets* (an array of *packets*) instead of synthesizing packets. When +this option is used *src*, *dst*, *sizes*, and *random_payload* are ignored. + ## Npackets (apps.test.npackets) The `Npackets` app allows are most N packets to flow through it. Any further diff --git a/src/apps/test/synth.lua b/src/apps/test/synth.lua index 9bddc9ce44..c3de931b1f 100644 --- a/src/apps/test/synth.lua +++ b/src/apps/test/synth.lua @@ -16,6 +16,7 @@ Synth = { packets = {}, random_payload = { default = false }, packet_id = { default = false }, + packets = {} } } @@ -42,7 +43,10 @@ function Synth:new (conf) packets[i] = dgram:packet() end end - return setmetatable({cursor=1, packets=packets}, {__index=Synth}) + return setmetatable( + {cursor=0, pktid=(conf.packet_id and 0), packets=packets}, + {__index=Synth} + ) end function Synth:pull () @@ -51,17 +55,17 @@ function Synth:pull () for _, o in ipairs(self.output) do local cursor = self.cursor for _ = 1, burst do - local c = packet.clone(packets[cursor]) + local p = packet.clone(packets[1+cursor]) if self.packet_id then -- 14 == sizeof(dstmac srcmac type) - ffi.cast("uint32_t *", clone.data+14)[0] = lib.htonl(self.pktid) + ffi.cast("uint32_t *", p.data+14)[0] = lib.htonl(self.pktid) self.pktid = self.pktid + 1 end - transmit(o, c) - cursor = (cursor + 1) % npackets + 1 + transmit(o, p) + cursor = (cursor + 1) % npackets end end - self.cursor = (self.cursor + burst) % npackets + 1 + self.cursor = (self.cursor + burst) % npackets end function Synth:stop () diff --git a/src/apps/xdp/README.md b/src/apps/xdp/README.md new file mode 100644 index 0000000000..e8335e18ae --- /dev/null +++ b/src/apps/xdp/README.md @@ -0,0 +1,69 @@ +# XDP socket app (apps.xdp.xdp) + +The `XDP` app implements a driver for Linux `AF_XDP` sockets. + +Its links are named `input` and `output`. 
+ + DIAGRAM: XDP + +-----------+ + | | + input ---->* XDP *----> output + | | + +-----------+ + +**Important:** To use the _XDP_ app, “Snabb XDP mode” must be enabled by +calling `xdp.snabb_enable_xdp()`. Calling this function replaces Snabb's native +memory allocator with the _UMEM_ allocator. The caller must ensure that no +packets have been allocated via `packet.allocate()` prior to calling this +function. + +## _Caveats_ + + * Memory allocated by the UMEM allocator can not be used with _DMA_ + drivers: using the XDP app precludes the use of Snabb’s native + hardware drivers such as `apps.intel_mp.intel_mp`. + + * Memory allocated by the UMEM allocator can not be shared with + other Snabb processes in the same process group: using + snabb_enable_xdp precludes the use of Interlink apps + (`apps.interlink`). + +## Maximum MTU + +Due to a combination of how Snabb uses packet buffers and a limitation of +`AF_XDP` the effective maximum MTU of the XDP app is 3,582. + +## Configuration + +— Key **ifname** + +*Required*. The name of the interface as shown in `ip link`. + +— Key **queue** + +*Optional*. Queue to bind to (zero based). The default is queue 0. + +## Module functions + +— Function **snabb_enable_xdp** *options* + +Enables “Snabb XDP mode”. See _Caveats_! + +### *Options* + +*Options* is a table of configuration options. The following parameters are +supported: + + - `num_chunks`—number of UMEM chunks to allocate. The default is 200,000 which + might not be enough depending on the number of XDP sockets used by the + process. Each instance of the XDP app uses up to around 25,000 chunks at any + time. However, generous over-provisioning (at least double of the expected + residency) is recommended due to buffering in the Snabb engine. 
+ +## Setting up XDP capable devices under Linux + +``` +$ echo 0000:01:00.0 > /sys/bus/pci/drivers/ixgbe/bind +$ ip link set ens1f0 addr 02:00:00:00:00:00 +$ ethtool --set-channels ens1f0 combined 1 +``` diff --git a/src/apps/xdp/bpf.lua b/src/apps/xdp/bpf.lua new file mode 100644 index 0000000000..44b4d68059 --- /dev/null +++ b/src/apps/xdp/bpf.lua @@ -0,0 +1,182 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +local ffi = require("ffi") +local bor = bit.bor + +-- BPF: just enough eBPF to assemble trivial XDP programs. +-- +-- See "BPF Architecture": +-- https://docs.cilium.io/en/v1.6/bpf/#bpf-architecture +-- +-- See Linux v4.19: +-- include/uapi/linux/bpf_common.h +-- include/uapi/linux/bpf.h +-- tools/include/linux/filter.h + +ins = ffi.typeof[[ + struct { + uint8_t op; /* opcode */ + uint8_t dst:4; /* dest register */ + uint8_t src:4; /* source register */ + int16_t off; /* signed offset */ + int32_t imm; /* signed immediate constant */ + } __attribute__((packed)) +]] + +c = { -- Op class + LD = 0x00, + LDX = 0x01, + ST = 0x02, + STX = 0x03, + ALU = 0x04, + JMP = 0x05, + RET = 0x06, + MISC = 0x07, + ALU64 = 0x07 -- alu mode in double word width +} + +f = { -- Load/store width + W = 0x00, -- 32-bit + H = 0x08, -- 16-bit + B = 0x10, -- 8-bit + DW = 0x18 -- 64-bit +} + +m = { -- Op mode + IMM = 0x00, + ABS = 0x20, + IND = 0x40, + MEM = 0x60, + LEN = 0x80, + MSH = 0xa0, + XADD = 0xc0 -- exclusive add +} + +a = { -- ALU mode + ADD = 0x00, + SUB = 0x10, + MUL = 0x20, + DIV = 0x30, + OR = 0x40, + AND = 0x50, + LSH = 0x60, + RSH = 0x70, + NEG = 0x80, + MOD = 0x90, + XOR = 0xa0, + MOV = 0xb0 +} + +s = { -- Src mode + K = 0x00, + X = 0x08, + MAP_FD = 0x01 +} + +j = { -- JMP mode + JA = 0x00, + JEQ = 0x10, + JGT = 0x20, + JGE = 0x30, + JSET = 0x40, + JNE = 0x50, + JLT = 0xa0, + JLE = 0xb0, + JSGT = 0x60, + JSGE = 0x70, + JSLT = 0xc0, + JSLE = 0xd0, + CALL = 0x80, + EXIT = 0x90 +} + +fn = { -- Built-in 
helpers + unspec = 0, + map_lookup_elem = 1, + map_update_elem = 2, + map_delete_elem = 3, + probe_read = 4, + ktime_get_ns = 5, + trace_printk = 6, + get_prandom_u32 = 7, + get_smp_processor_id = 8, + skb_store_bytes = 9, + l3_csum_replace = 10, + l4_csum_replace = 11, + tail_call = 12, + clone_redirect = 13, + get_current_pid_tgid = 14, + get_current_uid_gid = 15, + get_current_comm = 16, + get_cgroup_classid = 17, + skb_vlan_push = 18, + skb_vlan_pop = 19, + skb_get_tunnel_key = 20, + skb_set_tunnel_key = 21, + perf_event_read = 22, + redirect = 23, + get_route_realm = 24, + perf_event_output = 25, + skb_load_bytes = 26, + get_stackid = 27, + csum_diff = 28, + skb_get_tunnel_opt = 29, + skb_set_tunnel_opt = 30, + skb_change_proto = 31, + skb_change_type = 32, + skb_under_cgroup = 33, + get_hash_recalc = 34, + get_current_task = 35, + probe_write_user = 36, + current_task_under_cgroup = 37, + skb_change_tail = 38, + skb_pull_data = 39, + csum_update = 40, + set_hash_invalid = 41, + get_numa_node_id = 42, + skb_change_head = 43, + xdp_adjust_head = 44, + probe_read_str = 45, + get_socket_cookie = 46, + get_socket_uid = 47, + set_hash = 48, + setsockopt = 49, + skb_adjust_room = 50, + redirect_map = 51, + sk_redirect_map = 52, + sock_map_update = 53, + xdp_adjust_meta = 54, + perf_event_read_value = 55, + perf_prog_read_value = 56, + getsockopt = 57, + override_return = 58, + sock_ops_cb_flags_set = 59, + msg_redirect_map = 60, + msg_apply_bytes = 61, + msg_cork_bytes = 62, + msg_pull_data = 63, + bind = 64, + xdp_adjust_tail = 65, + skb_get_xfrm_state = 66, + get_stack = 67, + skb_load_bytes_relative = 68, + fib_lookup = 69, + sock_hash_update = 70, + msg_redirect_hash = 71, + sk_redirect_hash = 72, + lwt_push_encap = 73, + lwt_seg6_store_bytes = 74, + lwt_seg6_adjust_srh = 75, + lwt_seg6_action = 76, + rc_repeat = 77, + rc_keydown = 78, + skb_cgroup_id = 79, + get_current_cgroup_id = 80, + get_local_storage = 81, + sk_select_reuseport = 82, + 
skb_ancestor_cgroup_id = 83, +} + +function asm (insn) return ffi.typeof("$[?]", ins)(#insn, insn) end diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua new file mode 100644 index 0000000000..7ce28c8dc9 --- /dev/null +++ b/src/apps/xdp/xdp.lua @@ -0,0 +1,956 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +local S = require("syscall") +local ffi = require("ffi") +local bpf = require("apps.xdp.bpf") +local lib = require("core.lib") +local bits = lib.bits +local band, bor, rshift, tobit = bit.band, bit.bor, bit.rshift, bit.tobit + +-- ---- XDP driver for Snabb -------------------------------------------- + +-- This is a Snabb driver for Linux AF_XDP[1][2] sockets. The XDP kernel +-- interface presents an ABI/API combination similar to what a hardware NIC +-- usually provides: a way to attach to hardware queues, and a set of +-- descriptor rings for each queue used to enqueue and dequeue packet memory +-- buffers. +-- +-- Like with hardware NICs, XDP imposes us with constraints on the kind of +-- memory buffers we can enqueue onto its descriptor rings. Instead of DMA +-- memory required to drive hardware NICs, XDP requires us to register a +-- special kind of memory called UMEM to use with an AF_XDP socket. Only +-- buffers in the UMEM registered with a given socket can be used for I/O with +-- that socket! +-- +-- To consolidate this and other constraints (see "UMEM allocation" below) with +-- Snabb's packet memory architecture this driver allocates a single contiguous +-- memory region used as UMEM for all of the process' AF_XDP sockets, and +-- replaces the memory allocation routine dma_alloc in core.memory with its own +-- UMEM allocator. Hence, the packet freelist will be filled with UMEM memory +-- buffers used for all packet allocations. +-- +-- snabb_enable_xdp() +-- +-- To use the XDP app, "Snabb XDP mode" must be enabled by calling this +-- function. 
Calling this function replaces Snabb's native memory +-- allocator with the UMEM allocator. +-- +-- The caller must ensure that no packets have been allocated via +-- packet.allocate() prior to calling this function. +-- +-- CAVEATS: +-- +-- * Memory allocated by the UMEM allocator can not be used with DMA +-- drivers: using the XDP app precludes the use of Snabb's native +-- hardware drivers. +-- +-- * Memory allocated by the UMEM allocator can not be shared with +-- other Snabb processes in the same process group: using +-- snabb_enable_xdp precludes the use of Interlink apps +-- (apps.interlink). +-- +-- * UMEM chunks can not be larger than the page size (4096 bytes). +-- This AF_XDP limitation plus the way Snabb implements packet +-- buffer shifting operations limits the effective MTU: the MTU of +-- the XDP app is limited to 3,582 bytes. See XDP:create_xsk(). +-- +-- The only means by which an AF_XDP socket can receive packets from a device +-- is by attaching an eBPF XDP program to the Linux interface. The XDP app +-- assembles a minimal BPF program to route packets from device queues to XDP +-- sockets. See XDP:initialize_xdp. +-- +-- References: +-- [1] https://www.kernel.org/doc/html/v5.3/networking/af_xdp.html +-- [2] The Linux kernel source repository + + +-- ---- UMEM allocation ------------------------------------------------- + +-- Must maintain invariants: chunk size must be <= page size and UMEM must be +-- aligned to page size. + +local page_size = S.getpagesize() +local chunk_size = page_size +local num_chunks = 200000 +local umem_backing, umem, umem_size, umem_used + +-- UMEM allocator: multiple UMEM chunks must be allocated to fit a full packet. +-- However, AF_XDP sockets will only ever see the first of the chunks that make +-- up a packet. The extra (two) UMEM chunks are effectively unused by the +-- socket (but used by Snabb to ensure that packets can actually use +-- packet.max_payload bytes of payload). 
+-- See core.packet, "XDP rings", XDP:create_xsk(). +local function umem_alloc (size, align) + -- NB: align parameter ignored as we align to chunk_size + assert(align <= chunk_size) + assert(umem_used + size <= umem_size, + "Out of packet buffer memory. Increase num_chunks?") + local chunk = umem + umem_used + umem_used = lib.align(umem_used + size, chunk_size) + return chunk +end + +-- Convert from pointer to relative UMEM offset. +local function to_umem (ptr) + return ffi.cast("uintptr_t", ptr) - ffi.cast("uintptr_t", umem) +end + +-- Convert relative UMEM offset to pointer. +local function from_umem (offset) + return umem + offset +end + +local snabb_xdp_enabled = false +function snabb_enable_xdp (opt) + opt = opt or {} + if opt.num_chunks then + num_chunks = math.ceil(assert(tonumber(opt.num_chunks), + "num_chunks must be a number")) + end + -- Allocate UMEM + umem_size = chunk_size * num_chunks + umem_backing = ffi.new("char[?]", umem_size + page_size) + umem = ffi.cast("char*", lib.align(ffi.cast("uintptr_t", umem_backing), page_size)) + umem_used = 0 + -- Hot-swap core.memory.dma_alloc + require("core.memory").dma_alloc = umem_alloc + snabb_xdp_enabled = true +end + + +-- ---- FFI types ------------------------------------------------------- + +local xdp_umem_reg_t = ffi.typeof[[ + struct { + void * addr; /* Start of packet data area */ + uint64_t len; /* Length of packet data area */ + uint32_t chunk_size; + uint32_t headroom; + uint32_t flags; /* Not available in 4.19 */ + } __attribute__((packed))]] + +local sockaddr_xdp_t = ffi.typeof[[ + struct { + uint16_t family; + uint16_t flags; + uint32_t ifindex; + uint32_t queue_id; + uint32_t shared_umem_fd; + } __attribute__((packed))]] + +local xdp_ring_offset_t = ffi.typeof[[ + struct { + uint64_t producer; + uint64_t consumer; + uint64_t desc; + uint64_t flags; /* Not available in 4.19 */ + } __attribute__((packed))]] + +local xdp_ring_offset_noflags_t = ffi.typeof[[ + struct { + uint64_t producer; + 
uint64_t consumer; + uint64_t desc; + } __attribute__((packed))]] + +local xdp_mmap_offsets_templ = [[ + struct { + $ rx, + tx, + fr, /* Fill */ + cr; /* Completion */ + } __attribute__((packed))]] +local xdp_mmap_offsets_noflags_t = + ffi.typeof(xdp_mmap_offsets_templ, xdp_ring_offset_noflags_t) +local xdp_mmap_offsets_t = + ffi.typeof(xdp_mmap_offsets_templ, xdp_ring_offset_t) + +local xdp_ring_t = ffi.typeof[[ + struct { + char *map; + size_t maplen; + uint32_t *producer, *consumer, *flags; + void *desc; + uint32_t write, read; + }]] + +local xdp_desc_t = ffi.typeof[[ + struct { + uint64_t addr; + uint32_t len; + uint32_t options; + } __attribute__((packed))]] +local xdp_desc_ptr_t = ffi.typeof("$ *", xdp_desc_t) + +local netlink_set_link_xdp_request_t = ffi.typeof[[ + struct { + struct { /* nlmsghdr */ + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message content */ + uint16_t nlmsg_flags; /* Additional flags */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ + } nh; + struct { /* ifinfomsg */ + unsigned char ifi_family; + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ + } ifinfo; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + } xdp; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + int32_t fd; + } xdp_fd; + }__attribute__((packed))]] + + +-- ---- XDP rings ------------------------------------------------------- + +-- Ring operations for the single-producer single-consumer rings used for I/O +-- with AF_XDP sockets (xdp_ring_t). This is a blend between an +-- "Array + two unmasked indices"[1] and MCRingBuffer[2] implementation. 
+-- +-- Only the "Array + two unmasked indices" half of the implementation is +-- actually exposed by the kernel via the pointers to shared consumer/producer +-- fields (see xdp_ring_t, XDP:xdp_map_ring()). The MCRingBuffer portion is +-- added by userspace (us) to optimize our CPU cache footprint. +-- +-- Each AF_XDP socket has two rings (rx, tx) and each UMEM has two rings +-- (fr - fill ring, cr - completion ring). This XDP driver registers a new UMEM +-- for each socket so that each socket effectively has four rings +-- (rx, tx, fr, cr). +-- +-- For the Linux kernel to be able to fill the rx ring we need to provide it +-- UMEM chunks via the fill ring (fr). Chunks used by us to send packets via +-- the tx ring are returned by the kernel back to the userspace application via +-- the completion ring (cr). +-- +-- It is important to note that XDP rings operate on chunks: the addr field +-- of xdp_desc_t points *into* a chunk, and its len field is, from the kernel’s +-- perspective, bounded to the end of that chunk. See "UMEM allocation" and +-- XDP:create_xsk() for how this affects Snabb. +-- +-- NB: Snabb packet payloads are preceded by a two byte length field, so we +-- have to account for this overhead when retrieving packets from XDP +-- descriptor rings. See receive(r) below and XDP:create_xsk(). +-- +-- References: +-- [1] https://www.snellman.net/blog/archive/2016-12-13-ring-buffers/ +-- [2] https://www.cse.cuhk.edu.hk/~pclee/www/pubs/ancs09poster.pdf + +local xdp_ring_ndesc = 2048 -- Number of descriptors in ring. 
+ +local function mask (i) return band(i, xdp_ring_ndesc - 1) end +local function inc (i) return tobit(i + 1) end +local function full1 (r, w) return tobit(w - r) == xdp_ring_ndesc end + +function full (r) + if full1(r.read, r.write) then + if full1(r.consumer[0], r.write) then + return true + end + r.read = r.consumer[0] + end +end + +function transmit (r, p) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.write) + desc[idx].addr = to_umem(p.data) + desc[idx].len = p.length + r.write = inc(r.write) +end + +function fill (r, p) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + desc[idx] = to_umem(p) + r.write = inc(r.write) +end + +function push (r) + -- NB: no need for memory barrier on x86 because of TSO. + r.producer[0] = r.write +end + +function empty (r) + if r.read == r.write then + if r.read == r.producer[0] then + return true + end + r.write = r.producer[0] + end +end + +local packet_overhead = 2 -- leading struct packet length field (uint16_t) +function receive (r) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) + p.length = desc[idx].len + r.read = inc(r.read) + return p +end + +function reclaim (r) + -- NB: reclaim does not (re)set the payload length field. + -- Reclaimed packets do *not* have known payload lengths! + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", from_umem(desc[idx])) + r.read = inc(r.read) + return p +end + +function pull (r) + -- NB: no need for memory barrier on x86 (see push.) + r.consumer[0] = r.read +end + +function needs_wakeup (r) + -- NB: Unavailable when kernel does not support ring flags. 
+ -- See: XDP.kernel_has_ring_flags, XDP:create_xsk(), XDP:kick() + return band(r.flags[0], bits{XDP_RING_NEED_WAKEUP=1}) +end + +-- Rewind routines for transmit/fill. These are used by XDP:stop() to reclaim +-- packet buffers left in-flight after shutdown. + +function rewind_transmit (r) + r.write = tobit(r.write - 1) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.write) + return ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) +end + +function rewind_fill (r) + r.write = tobit(r.write - 1) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + return ffi.cast("struct packet *", from_umem(desc[idx])) +end + + +-- ---- XDP App --------------------------------------------------------- + +XDP = { + config = { + ifname = {required=true}, -- interface name + queue = {default=0} -- interface queue (zero based) + }, + -- Class variables: + kernel_has_ring_flags = true -- feature detection status for descriptor ring flags +} + +-- Class methods + +function XDP:new (conf) + assert(snabb_xdp_enabled, "Snabb XDP mode must be enabled.") + -- Ensure interface is initialized for XDP usage. + local lockfd, mapfd = self:open_interface(conf.ifname) + -- Create XDP socket (xsk) for queue. + local xsk = self:create_xsk(conf.ifname, lockfd, conf.queue) + -- Attach the socket to queue in the BPF map. + self:set_queue_socket(mapfd, conf.queue, xsk) + mapfd:close() -- no longer needed + -- Finish initialization. + return setmetatable(xsk, {__index=XDP}) +end + +function XDP:open_interface (ifname) + -- Open an interface-dependent file we know should exist to use as a + -- Snabb-wide lock. The contents of the file are really irrelevant here. + -- However, we depend on the file not being locked by other applications in + -- general. 
:-) + local lockfd = S.open("/sys/class/net/"..ifname.."/operstate", "rdonly") + local mapfd, progfd + local xskmap_path = "/sys/fs/bpf/snabb/"..ifname.."/xskmap" + local prog_path = "/sys/fs/bpf/snabb/"..ifname.."/xdp" + -- If the open above failed we assume that no device by ifname exists. + assert(lockfd, "Could not open interface: "..ifname.." (does it exist?)") + if lockfd:flock("ex, nb") then + -- If we get an exclusive lock we know that no other Snabb processes are + -- using the interface so its safe to setup the interface and replace any + -- existsing BPF XDP program/maps attached to it. + S.mkdir("/sys/fs/bpf/snabb", "rwxu, rgrp, xgrp, roth, xoth") + S.util.rm("/sys/fs/bpf/snabb/"..ifname) + S.mkdir("/sys/fs/bpf/snabb/"..ifname, "rwxu, rgrp, xgrp, roth, xoth") + -- Create xskmap and XDP program to run on the NIC. + mapfd = self:create_xskmap() + progfd = self:xdp_prog(mapfd) + self:set_link_xdp(ifname, progfd) + -- Pin xskmap so it can be accessed by other Snabb processes to attach to + -- the interface. Also pin the XDP program, just 'cause. + assert(S.bpf_obj_pin(xskmap_path, mapfd)) + assert(S.bpf_obj_pin(prog_path, progfd)) + progfd:close() -- no longer needed + lockfd:flock("sh") -- share lock + else + lockfd:flock("sh") + -- Wait for the lock to be shared: once it is no longer held exclusively + -- we know that the interface is setup and ready to use. + -- Get the currently pinned xskmap to insert our XDP socket into. + mapfd = assert(S.bpf_obj_get(xskmap_path)) + end + -- lockfd: holds a shared lock for as long as we do not close it, signaling + -- other Snabb processes that the interface is in use. + -- mapfd: the xskmap for the interface used to + -- attach XDP sockets to queues. + return lockfd, mapfd +end + +function XDP:create_xskmap () + local klen, vlen = ffi.sizeof("int"), ffi.sizeof("int") + local nentries = 128 + local map, err + for _ = 1,7 do + -- Try to create BPF map. 
+ map, err = S.bpf_map_create('xskmap', klen, vlen, nentries) + -- Return map on success. + if map then return map end + -- Failed to create map, increase MEMLOCK limit and retry. + -- See /~https://github.com/xdp-project/xdp-tutorial/issues/63 + local lim = assert(S.getrlimit('memlock')) + assert(S.setrlimit('memlock', {cur=lim.cur*2, max=lim.max*2})) + end + -- Exceeded retries, bail. + error("Failed to create BPF map: "..tostring(err)) +end + +function XDP:xdp_prog (xskmap) + -- Assemble and load XDP BPF program. + local c, f, m, a, s, j, fn = + bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j, bpf.fn + -- The program below looks up the incoming packet's queue index in xskmap to + -- find the corresponding XDP socket (xsk) to deliver the packet to. + local insns = bpf.asm{ + -- r3 = XDP_ABORTED + { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, + -- r2 = ((struct xdp_md *)ctx)->rx_queue_index + { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, + -- r1 = xskmap + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, imm=xskmap:getfd() }, + { imm=0 }, -- nb: upper 32 bits of 64-bit (DW) immediate + -- r0 = redirect_map(r1, r2, r3) + { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, + -- EXIT: + { op=bor(c.JMP, j.EXIT) } + } + local prog, err, log = S.bpf_prog_load( + 'xdp', insns, ffi.sizeof(insns) / ffi.sizeof(bpf.ins), "Apache 2.0" + ) + if prog then + return prog + else + error(tostring(err).."\n"..log) + end +end + +function XDP:set_link_xdp(ifname, prog) + -- Open a NETLINK socket, and transmit command that attaches XDP program + -- prog to link by ifname. 
+ local netlink = assert(S.socket('netlink', 'raw', 'route')) + local SOL_NETLINK = 270 + local NETLINK_EXT_ACK = 11 + local ext_ack_on = ffi.new("int[1]", 1) + assert(S.setsockopt(netlink, SOL_NETLINK, NETLINK_EXT_ACK, + ext_ack_on, ffi.sizeof(ext_ack_on))) + local IFLA_XDP = 43 + local IFLA_XDP_FD = 1 + local IFLA_XDP_FLAGS = 3 + local request = ffi.new( + netlink_set_link_xdp_request_t, + { nh = { nlmsg_flags = bor(S.c.NLM_F.REQUEST, S.c.NLM_F.ACK), + nlmsg_type = S.c.RTM.SETLINK }, + ifinfo = { ifi_family = S.c.AF.UNSPEC, + ifi_index = S.util.if_nametoindex(ifname) }, + xdp = { nla_type = bor(bits{ NLA_F_NESTED=15 }, IFLA_XDP) }, + xdp_fd = { nla_type = IFLA_XDP_FD, + fd = prog:getfd() } } + ) + request.nh.nlmsg_len = ffi.sizeof(request) + request.xdp.nla_len = ffi.sizeof(request.xdp) + ffi.sizeof(request.xdp_fd) + request.xdp_fd.nla_len = ffi.sizeof(request.xdp_fd) + assert(netlink:send(request, ffi.sizeof(request))) + local response = assert(S.nl.read(netlink, nil, nil, true)) + if response.error then + error("NETLINK responded with error: "..tostring(response.error)) + end + netlink:close() +end + +function XDP:create_xsk (ifname, lockfd, queue) + local xsk = { sock = assert(S.socket('xdp', 'raw')), lockfd = lockfd } + -- Register UMEM. + local umem_reg = ffi.new( + xdp_umem_reg_t, + { addr = umem, + len = umem_size, + -- The chunk size is equal to the page size (4096 bytes, see + -- "UMEM allocation"), and XDP packet descriptors point to individual + -- chunks (see "XDP rings"). Hence, the MTU of AF_XDP sockets is + -- limited to the page size, and the effective MTU of the XDP app is + -- further limited by the way core.packet implements packet shifting + -- operations (see headroom below). 
The effective MTU is calculated as + -- 4096 - packet.packet_alignment (512) - packet_overhead (2) = 3582 + chunk_size = chunk_size, + -- By configuring the headroom according to core.packet we make sure + -- that XDP leaves enough headroom for the preceeding length field of + -- Snabb's struct packet as well as headroom for packet shifting + -- operations. + headroom = packet.default_headroom + packet_overhead, + -- flags = bits{ XDP_UMEM_UNALIGNED_CHUNK_FLAG=1 } + } + ) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_reg', umem_reg, ffi.sizeof(umem_reg))) + -- Configure XDP rings and map them into this process’ memory. + local ndesc = ffi.new("int[1]", xdp_ring_ndesc) + assert(xsk.sock:setsockopt('xdp', 'xdp_rx_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_tx_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_fill_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_completion_ring', ndesc, ffi.sizeof(ndesc))) + local layouts = ffi.new(xdp_mmap_offsets_t) + if not pcall(S.getsockopt, xsk.sock, 'xdp', 'xdp_mmap_offsets', layouts, ffi.sizeof(layouts)) then + -- Kernel appears not to support XDP ring flags field. Disable feature, + -- and retry with xdp_mmap_offsets_noflags_t. + self.kernel_has_ring_flags = false + layouts = ffi.new(xdp_mmap_offsets_noflags_t) + assert(xsk.sock:getsockopt('xdp', 'xdp_mmap_offsets', layouts, ffi.sizeof(layouts))) + end + xsk.rx = self:xdp_map_ring(xsk.sock, layouts.rx, xdp_desc_t, 0x000000000ULL) -- XDP_PGOFF_RX_RING + xsk.tx = self:xdp_map_ring(xsk.sock, layouts.tx, xdp_desc_t, 0x080000000ULL) -- XDP_PGOFF_TX_RING + -- NB: fill and completion rings do not carry full descriptors, only + -- relative UMEM offsets (addr). 
+ xsk.fr = self:xdp_map_ring(xsk.sock, layouts.fr, "uint64_t", 0x100000000ULL) -- XDP_UMEM_PGOFF_FILL_RING + xsk.cr = self:xdp_map_ring(xsk.sock, layouts.cr, "uint64_t", 0x180000000ULL) -- XDP_UMEM_PGOFF_COMPLETION_RING + -- Counters to track packets in-flight through kernel. + -- - rxq is incremented when a packet buffer is enqueued onto the + -- fill ring and decremented when a packet buffer is dequeued from the + -- tx ring. I.e., it tracks the number of unused buffers currently left + -- on the fill ring. + -- - txq is incremented when a packet buffer is enqueued onto the tx ring + -- and decremented then a packet buffer is dequeued from the + -- completion ring. I.e, it tracks number of unused buffers currently + -- left on the tx ring. + -- The rxq and txq tallies are used by XDP:stop() to perform a clean + -- socket shutdown without leaking packet buffers. + xsk.rxq = 0 + xsk.txq = 0 + -- Bind socket to interface + local sa = ffi.new( + sockaddr_xdp_t, + { family = S.c.AF.XDP, + ifindex = S.util.if_nametoindex(ifname), + queue_id = queue, + -- flags = bits{ XDP_ZEROCOPY=2 } + } + ) + local ok, err = xsk.sock:bind(sa, ffi.sizeof(sa)) + if not ok then + error(("Unable to bind AF_XDP socket to %s queue %d (%s)") + :format(ifname, queue, err)) + end + return xsk +end + +-- Map an XDP socket ring into this process’ memory. 
+function XDP:xdp_map_ring (socket, layout, desc_t, offset)
+   -- Maps the ring found at mmap offset `offset' of socket and returns an
+   -- xdp_ring_t describing it. layout supplies the offsets of the producer
+   -- cursor, consumer cursor, flags word and descriptor array within the
+   -- mapping; desc_t is the type of a single descriptor slot.
+   local prot = "read, write"
+   local flags = "shared, populate"
+   local r = ffi.new(xdp_ring_t)
+   -- The mapping extends up to the end of the descriptor array.
+   r.maplen = layout.desc + xdp_ring_ndesc * ffi.sizeof(desc_t)
+   r.map = assert(S.mmap(nil, r.maplen, prot, flags, socket, offset))
+   r.producer = ffi.cast("uint32_t *", r.map + layout.producer)
+   r.consumer = ffi.cast("uint32_t *", r.map + layout.consumer)
+   -- r.flags is only set up if the kernel provides a ring flags field.
+   if self.kernel_has_ring_flags then
+      r.flags = ffi.cast("uint32_t *", r.map + layout.flags)
+   end
+   r.desc = r.map + layout.desc
+   return r
+end
+
+function XDP:set_queue_socket(xskmap, queue, xsk)
+   -- Store the file descriptor of xsk's socket under key queue in xskmap.
+   assert(S.bpf_map_op('map_update_elem', xskmap,
+                       ffi.new("int[1]", queue),
+                       ffi.new("int[1]", xsk.sock:getfd())))
+end
+
+-- Instance methods
+
+function XDP:stop ()
+   -- Shut down the app: close the socket, free all packet buffers still held
+   -- by the rings, unmap the rings, and release the interface lock.
+   --
+   -- Close socket.
+   self.sock:close()
+   -- Reclaim packet buffers left on rings.
+   --
+   -- Problem: we need a way to tell apart which packets buffers on the
+   -- (write-only) tx and fill rings need to be freed, and which packet buffers
+   -- were already enqueued to the (read-only) rx and completions rings.
+   -- Otherwise, we might cause memory corruption by double-freeing packets.
+   --
+   -- We can not however reliably inspect the kernel's internal read cursors
+   -- for the tx and fill rings. Instead we solve this with a *hack* based on
+   -- the assumptions that 1) the kernel does not modify the rings after
+   -- closing the XDP socket; 2) the kernel moves packets from fill to rx rings
+   -- and tx to completion rings *in-order*; 3) the kernel does not clobber
+   -- descriptors that have not yet moved to an rx or completion ring.
+   --
+   -- First we flush the rx and completion rings, freeing any dequeued packets,
+   -- while updating the rxq and txq tallies (see XDP:create_xsk()).
+   while not empty(self.rx) do
+      packet.free_internal(receive(self.rx))
+      self.rxq = self.rxq - 1
+   end
+   while not empty(self.cr) do
+      packet.free_internal(reclaim(self.cr))
+      self.txq = self.txq - 1
+   end
+   -- Then, we use the final rxq/txq tallies to infer how many packets on the
+   -- transmit and fill rings are left dangling, and free those amounts of
+   -- packets (starting from the most recently enqueued, going backwards) from
+   -- each ring individually.
+   for _ = 1, self.txq do
+      packet.free_internal(rewind_transmit(self.tx))
+   end
+   for _ = 1, self.rxq do
+      packet.free_internal(rewind_fill(self.fr))
+   end
+   -- Unmap rings.
+   assert(S.munmap(self.rx.map, self.rx.maplen))
+   assert(S.munmap(self.tx.map, self.tx.maplen))
+   assert(S.munmap(self.fr.map, self.fr.maplen))
+   assert(S.munmap(self.cr.map, self.cr.maplen))
+   -- Close interface lockfd. See XDP:open_interface().
+   self.lockfd:close()
+end
+
+function XDP:pull ()
+   -- Refill the rings, then forward up to engine.pull_npackets received
+   -- packets from the rx ring to the output link (if one is attached).
+   local output = self.output.output
+   local rx = self.rx
+   self:refill()
+   if not output then return end
+   for _ = 1, engine.pull_npackets do
+      if empty(rx) then break end
+      link.transmit(output, receive(rx))
+      self.rxq = self.rxq - 1
+   end
+   -- Publish the new read cursor of the rx ring.
+   pull(rx)
+end
+
+function XDP:push ()
+   -- Move packets from the input link (if one is attached) onto the tx ring
+   -- until either the link is empty or the ring is full, then publish the
+   -- ring and wake up the kernel if required.
+   local input = self.input.input
+   local tx = self.tx
+   if not input then return end
+   while not link.empty(input) and not full(tx) do
+      local p = link.receive(input)
+      transmit(tx, p)
+      self.txq = self.txq + 1
+      -- Stimulate breathing: after the kernel is done with the packet buffer
+      -- it will either be fed back from the completion ring onto the fill
+      -- ring, or put back onto the freelist via packet.free_internal; hence,
+      -- account statistics for freed packet here in order to signal to the
+      -- engine that throughput is happening.
+      packet.account_free(p)
+   end
+   push(tx)
+   if self.kernel_has_ring_flags then
+      -- NOTE(review): needs_wakeup() returns a number, and numbers are always
+      -- truthy in Lua (even 0), so this kicks on every push. Confirm whether
+      -- that is intended before changing it.
+      if needs_wakeup(tx) then self:kick() end
+   else
+      if full(tx) then self:kick() end
+   end
+end
+
+function XDP:refill ()
+   -- Recycle packet buffers between the completion and fill rings, depending
+   -- on which links are attached, and keep the rxq/txq tallies up to date.
+   local input, output = self.input.input, self.output.output
+   local fr, cr = self.fr, self.cr
+   -- If the queue operates in duplex mode (i.e., has both input and output
+   -- links attached) we feed packet buffers from the completion ring back onto
+   -- the fill ring.
+   if input and output then
+      while not (empty(cr) or full(fr)) do
+         fill(fr, reclaim(cr))
+         self.txq = self.txq - 1
+         self.rxq = self.rxq + 1
+      end
+   end
+   -- If the queue has its output attached we make sure that the kernel does
+   -- not run out of packet buffers to fill the rx ring with by keeping the
+   -- fill ring topped up with fresh packets.
+   -- (If no input is attached, the completion ring is not used, and
+   -- all packet buffers for rx will be allocated here.)
+   if output then
+      while not full(fr) do
+         fill(fr, packet.allocate())
+         self.rxq = self.rxq + 1
+      end
+   end
+   -- If the queue has its input attached we release any packet buffers
+   -- remaining in the completion ring back to the packet freelist.
+   -- (If no output is attached, the fill ring is not used, and
+   -- all packet buffers used for tx will be reclaimed here.)
+   if input then
+      while not empty(cr) do
+         -- NB: mandatory free_internal since we do not know the payload length
+         -- of reclaimed packets.
+         packet.free_internal(reclaim(cr))
+         self.txq = self.txq - 1
+      end
+   end
+   -- Publish the fill ring's write cursor and the completion ring's read
+   -- cursor.
+   push(fr)
+   pull(cr)
+end
+
+function XDP:kick ()
+   -- Wake up Linux kernel to process tx ring packets.
+   -- The result of the (non-blocking, empty) sendto is not checked.
+   self.sock:sendto(nil, 0, 'dontwait', nil, 0)
+end
+
+
+-- ---- Tests -----------------------------------------------------------
+
+-- Useful setup commands:
+--   $ echo 0000:01:00.0 > /sys/bus/pci/drivers/ixgbe/bind
+--   $ ip link set ens1f0 addr 02:00:00:00:00:00
+--   $ ethtool --set-channels ens1f0 combined 1
+
+function selftest ()
+   -- Run the test suite against the two interfaces named by SNABB_XDP0 and
+   -- SNABB_XDP1 (with MAC addresses SNABB_XDP_MAC0 and SNABB_XDP_MAC1). The
+   -- optional SNABB_XDP_NQUEUES sets the number of queues used (default: 1).
+   -- Exits with engine.test_skipped_code unless all four are set.
+   print("selftest: apps.xdp.xdp")
+   local xdpdeva = lib.getenv("SNABB_XDP0")
+   local xdpmaca = lib.getenv("SNABB_XDP_MAC0")
+   local xdpdevb = lib.getenv("SNABB_XDP1")
+   local xdpmacb = lib.getenv("SNABB_XDP_MAC1")
+   local nqueues = tonumber(lib.getenv("SNABB_XDP_NQUEUES")) or 1
+   if not (xdpdeva and xdpmaca and xdpdevb and xdpmacb) then
+      -- Name every variable the check above requires.
+      print("SNABB_XDP0, SNABB_XDP_MAC0, SNABB_XDP1 and SNABB_XDP_MAC1 must be set. Skipping selftest.")
+      os.exit(engine.test_skipped_code)
+   end
+   snabb_enable_xdp()
+   engine.report_load()
+   print("test: rxtx")
+   selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues)
+   print("test: duplex")
+   selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues)
+   print("test: rxtx_match")
+   selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb)
+   -- Sharing an interface between processes needs at least two queues.
+   if nqueues > 1 then
+      print("test: share_interface")
+      selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues)
+   end
+   print("selftest ok")
+end
+
+local function random_v4_packets (conf)
+   -- Returns an array of 100 packets for each size in conf.sizes. Each packet
+   -- consists of an Ethernet header (conf.src -> conf.dst, type 0x0800), an
+   -- IPv4 header with random addresses, and a random payload that pads the
+   -- packet to the given size.
+   local ethernet = require("lib.protocol.ethernet")
+   local ipv4 = require("lib.protocol.ipv4")
+   local eth = ethernet:new{src = ethernet:pton(conf.src),
+                            dst = ethernet:pton(conf.dst),
+                            type = 0x0800}
+   local packets = {}
+   for _, size in ipairs(conf.sizes) do
+      for _=1,100 do
+         local ip = ipv4:new{src=lib.random_bytes(4),
+                             dst=lib.random_bytes(4)}
+         ip:total_length(size - eth:sizeof())
+         local payload_length = ip:total_length() - ip:sizeof()
+         local p = packet.allocate()
+         packet.append(p, eth:header(), eth:sizeof())
+         packet.append(p, ip:header(), ip:sizeof())
+         packet.append(p, lib.random_bytes(payload_length), payload_length)
+         table.insert(packets, p)
+      end
+   end
+   return packets
+end
+
+function selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues)
+   -- Unidirectional test: on each of nqueues queues, packets from a synthetic
+   -- source are sent via xdpdeva and whatever arrives on xdpdevb is sunk.
+   -- Asserts that every queue carried traffic, and that the total number of
+   -- received packets is within 10% of the number of packets sent.
+   local c = config.new()
+   local basic = require("apps.basic.basic_apps")
+   local synth = require("apps.test.synth")
+   config.app(c, "source", synth.Synth, {
+                 packets = random_v4_packets{
+                    sizes = {60},
+                    src = xdpmaca,
+                    dst = xdpmacb
+   }})
+   config.app(c, "sink", basic.Sink)
+   for queue = 0, nqueues-1 do
+      local queue_a = xdpdeva.."_q"..queue
+      local queue_b = xdpdevb.."_q"..queue
+      config.app(c, queue_a, XDP, {
+                    ifname = xdpdeva,
+                    queue = queue
+      })
+      config.app(c, queue_b, XDP, {
+                    ifname = xdpdevb,
+                    queue = queue
+      })
+      config.link(c, "source.output"..queue.." -> "..queue_a..".input")
+      config.link(c, queue_b..".output -> sink.input"..queue)
+   end
+   engine.configure(c)
+   print("kernel_has_ring_flags", XDP.kernel_has_ring_flags)
+   engine.main{ duration=1 }
+   engine.report_links()
+   local txtotal, rxtotal = 0, 0
+   for queue = 0, nqueues-1 do
+      local tx = link.stats(engine.app_table.source.output["output"..queue])
+      local rx = link.stats(engine.app_table.sink.input["input"..queue])
+      assert(tx.rxpackets > 0, "No packets sent on queue: "..queue)
+      assert(rx.rxpackets > 0, "No packets received on queue: "..queue)
+      txtotal = txtotal + tx.rxpackets
+      rxtotal = rxtotal + rx.rxpackets
+   end
+   assert(math.abs(txtotal - rxtotal) <= txtotal*.10, -- 10% tolerance
+          "Too little packets received")
+end
+
+function selftest_duplex (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues)
+   -- Bidirectional test: on each of nqueues queues, source_a sends via
+   -- xdpdeva and source_b sends via xdpdevb, while everything received on
+   -- either interface is sunk. Asserts for both directions that every queue
+   -- carried traffic, and that the total number of received packets is
+   -- within 10% of the number of packets sent.
+   local c = config.new()
+   local basic = require("apps.basic.basic_apps")
+   local synth = require("apps.test.synth")
+   config.app(c, "source_a", synth.Synth, {
+                 packets = random_v4_packets{
+                    sizes = {60},
+                    src = xdpmaca,
+                    dst = xdpmacb
+   }})
+   config.app(c, "source_b", synth.Synth, {
+                 packets = random_v4_packets{
+                    sizes = {60},
+                    src = xdpmacb,
+                    dst = xdpmaca
+   }})
+   config.app(c, "sink", basic.Sink)
+   for queue = 0, nqueues-1 do
+      local queue_a = xdpdeva.."_q"..queue
+      local queue_b = xdpdevb.."_q"..queue
+      config.app(c, queue_a, XDP, {
+                    ifname = xdpdeva,
+                    queue = queue
+      })
+      config.app(c, queue_b, XDP, {
+                    ifname = xdpdevb,
+                    queue = queue
+      })
+      config.link(c, "source_a.output"..queue.." -> "..queue_a..".input")
+      config.link(c, "source_b.output"..queue.." -> "..queue_b..".input")
+      config.link(c, queue_a..".output -> sink.input_a"..queue)
+      config.link(c, queue_b..".output -> sink.input_b"..queue)
+   end
+   engine.configure(c)
+   print("kernel_has_ring_flags", XDP.kernel_has_ring_flags)
+   engine.main{ duration=1 }
+   engine.report_links()
+   -- Check both directions. Packets from source_<tx> enter the wire on one
+   -- interface and arrive at the sink through the output of the other one,
+   -- i.e. on sink.input_<rx><queue>.
+   -- NB: this has to be a sequence for ipairs to visit its entries (ipairs
+   -- skips string keys), and the link names must match the ones configured
+   -- above.
+   for _, stream in ipairs{
+      {label='a->b', tx='a', rx='b'},
+      {label='b->a', tx='b', rx='a'}
+   } do
+      local label = stream.label
+      local txtotal, rxtotal = 0, 0
+      for queue = 0, nqueues-1 do
+         local tx = link.stats(engine.app_table["source_"..stream.tx].output["output"..queue])
+         local rx = link.stats(engine.app_table.sink.input["input_"..stream.rx..queue])
+         assert(tx.rxpackets > 0, "["..label.."] No packets sent on queue: "..queue)
+         assert(rx.rxpackets > 0, "["..label.."] No packets received on queue: "..queue)
+         txtotal = txtotal + tx.rxpackets
+         rxtotal = rxtotal + rx.rxpackets
+      end
+      assert(math.abs(txtotal - rxtotal) <= txtotal*.10, -- 10% tolerance
+             "["..label.."] Too little packets received")
+   end
+end
+
+function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb)
+   -- Integrity test: packets of various sizes are sent via xdpdeva, and the
+   -- packets received on xdpdevb are matched against a copy of the first
+   -- 1000 packets sent. Asserts that the match app recorded no errors.
+   local c = config.new()
+   local synth = require("apps.test.synth")
+   local npackets = require("apps.test.npackets")
+   local match = require("apps.test.match")
+   config.app(c, "source", synth.Synth, {
+                 sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,1501},
+                 src = xdpmaca,
+                 dst = xdpmacb,
+                 random_payload = true
+   })
+   config.app(c, "npackets", npackets.Npackets, {npackets=1000})
+   config.app(c, "match", match.Match)
+   config.app(c, xdpdeva, XDP, {ifname=xdpdeva})
+   config.app(c, xdpdevb, XDP, {ifname=xdpdevb})
+   config.link(c, "source.output -> "..xdpdeva..".input")
+   config.link(c, xdpdevb..".output -> match.rx")
+   config.link(c, "source.copy -> npackets.input")
+   config.link(c, "npackets.output -> match.comparator")
+   engine.configure(c)
+   engine.main{ duration=.1 }
+   engine.report_links()
+   engine.report_apps()
+   assert(#engine.app_table.match:errors() == 0, "Match errors.")
+end
+
+function selftest_share_interface_worker (xdpdev, queue)
+   -- Worker process part of selftest_share_interface: attaches to queue of
+   -- xdpdev, sinks what it receives for a short while, and asserts that at
+   -- least one packet has arrived.
+   snabb_enable_xdp()
+   local c = config.new()
+   local basic = require("apps.basic.basic_apps")
+   local recv = xdpdev.."_q"..queue
+   config.app(c, recv, XDP, {
+                 ifname = xdpdev,
+                 queue = queue
+   })
+   config.app(c, "sink", basic.Sink)
+   config.link(c, recv..".output -> sink.input")
+   engine.configure(c)
+   engine.main{ duration=.1, no_report = true }
+   print("[worker links]")
+   engine.report_links()
+   assert(link.stats(engine.app_table.sink.input.input).rxpackets > 0,
+          "No packets received on "..recv.." in worker.")
+end
+
+function selftest_share_interface (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues)
+   -- Shared interface test: this process drives queues 0..nqueues-2 of both
+   -- interfaces, while a worker process attaches to the last queue of
+   -- xdpdevb (see selftest_share_interface_worker). Runs until the worker
+   -- has terminated, and exits with the worker's status if it is non-zero.
+   local c = config.new()
+   local worker = require("core.worker")
+   local basic = require("apps.basic.basic_apps")
+   local synth = require("apps.test.synth")
+   config.app(c, "source", synth.Synth, {
+                 packets = random_v4_packets{
+                    sizes = {60},
+                    src = xdpmaca,
+                    dst = xdpmacb
+   }})
+   config.app(c, "sink", basic.Sink)
+   for queue = 0, nqueues-2 do
+      local queue_a = xdpdeva.."_q"..queue
+      local queue_b = xdpdevb.."_q"..queue
+      config.app(c, queue_a, XDP, {
+                    ifname = xdpdeva,
+                    queue = queue
+      })
+      config.app(c, queue_b, XDP, {
+                    ifname = xdpdevb,
+                    queue = queue
+      })
+      config.link(c, "source.output"..queue.." -> "..queue_a..".input")
+      config.link(c, queue_b..".output -> sink.input"..queue)
+   end
+   engine.configure(c)
+   worker.start('worker', ("require('apps.xdp.xdp').selftest_share_interface_worker('%s', %d)")
+                   :format(xdpdevb, nqueues-1))
+   engine.main{ done=function () return not worker.status().worker.alive end,
+                no_report = true }
+   local worker_status = worker.status().worker.status
+   print("[parent links]")
+   engine.report_links()
+   if worker_status ~= 0 then
+      os.exit(worker_status)
+   end
+end
diff --git a/src/core/packet.lua b/src/core/packet.lua
index 783eff5106..efb2d24c43 100644
--- a/src/core/packet.lua
+++ b/src/core/packet.lua
@@ -25,11 +25,11 @@ max_payload = tonumber(C.PACKET_PAYLOAD_SIZE)
 -- For operations that add or remove headers from the beginning of a
 -- packet, instead of copying around the payload we just move the
 -- packet structure as a whole around.
-local packet_alignment = 512
-local default_headroom = 256
+packet_alignment = 512
+default_headroom = 256
 -- The Intel82599 driver requires even-byte alignment, so let's keep
 -- things aligned at least this much.
-local minimum_alignment = 2
+minimum_alignment = 2
 
 local function get_alignment (addr, alignment)
    -- Precondition: alignment is a power of 2.
@@ -266,7 +266,7 @@ end function from_string (d) return from_pointer(d, #d) end -- Free a packet that is no longer in use. -local function free_internal (p) +function free_internal (p) local ptr = ffi.cast("char*", p) p = ffi.cast(packet_ptr_t, ptr - get_headroom(ptr) + default_headroom) p.length = 0