From 328ba098e9d87da46a99ce1c9b5fdb0ceedbf5a6 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 8 Mar 2022 15:37:59 +0100 Subject: [PATCH 01/13] apps.mellanox.connectx_test: make sure links are up in basic_match --- src/apps/mellanox/connectx_test.lua | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index cba693ddcd..6eed44836c 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -316,6 +316,12 @@ function basic_match (pci0, pci1) engine.configure(c) + print("waiting for linkup...") + lib.waitfor(function () + return engine.app_table.nic0.hca:linkup() + and engine.app_table.nic1.hca:linkup() + end) + engine.main({duration = 1, report = false}) engine.report_links() engine.report_apps() From 3cd72c244db043c5042beb88838288df33043859 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 27 Jan 2022 00:23:36 +0100 Subject: [PATCH 02/13] apps.mellanox: packetblaster --- src/apps/mellanox/connectx.lua | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 9a9b2e1184..3173c36a30 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -1328,6 +1328,17 @@ function IO:new (conf) close() end + -- Configure self as packetblaster? + if conf.packetblaster then + self.push = nil + self.pull = function (self) + if activate() then + sq:blast(self.input.input or self.input.rx) + deactivate() + end + end + end + return self end @@ -1505,6 +1516,72 @@ function SQ:new (cxq, mmio) end end + -- Packetblaster: blast packets from link out of send queue. + function sq:blast (l) + local kickoff = sq:blast_load(l) + + -- Get current send queue tail (hardware controlled) + local opcode = cxq.scq[0].u8[0x38] + if opcode == 0x0A then + local wqeid = shr(bswap(cxq.scq[0].u32[0x3C/4]), 16) + + -- Keep send queue topped up + local next_slot = slot(cxq.next_tx_wqeid) + while next_slot ~= slot(wqeid) do + local wqe = cxq.swq[next_slot] + -- Update control segment + wqe.u32[0] = bswap(shl(cxq.next_tx_wqeid, 8) + 0x0A) + -- Advance counters + cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 + next_slot = slot(cxq.next_tx_wqeid) + ring = true + end + end + + if opcode == 0x0A or kickoff then + -- Ring the doorbell + local current_packet = slot(cxq.next_tx_wqeid + mask) + cxq.doorbell.send = bswap(cxq.next_tx_wqeid) + cxq.bf_next[0] = cxq.swq[current_packet].u64[0] + -- Switch next/alternate blue flame register for next time + cxq.bf_next, cxq.bf_alt = cxq.bf_alt, cxq.bf_next + end + end + + -- Packetblaster: load packets from link into send queue. + local loaded = 0 + function sq:blast_load (l) + while loaded < cxq.sqsize and not link.empty(l) do + local p = link.receive(l) + local next_slot = slot(cxq.next_tx_wqeid) + local wqe = cxq.swq[next_slot] + + -- Construct a 64-byte transmit descriptor. + -- This is in three parts: Control, Ethernet, Data. + -- The Ethernet part includes some inline data. + + -- Control segment + wqe.u32[0] = bswap(shl(cxq.next_tx_wqeid, 8) + 0x0A) + wqe.u32[1] = bswap(shl(cxq.sqn, 8) + 4) + wqe.u32[2] = bswap(shl(2, 2)) -- completion always + -- Ethernet segment + local ninline = 16 + wqe.u32[7] = bswap(shl(ninline, 16)) + ffi.copy(wqe.u8 + 0x1E, p.data, ninline) + -- Send Data Segment (inline data) + wqe.u32[12] = bswap(p.length - ninline) + wqe.u32[13] = bswap(cxq.rlkey) + local phy = memory.virtual_to_physical(p.data + ninline) + wqe.u32[14] = bswap(tonumber(shr(phy, 32))) + wqe.u32[15] = bswap(tonumber(band(phy, 0xFFFFFFFF))) + -- Advance counters + cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 + loaded = loaded + 1 + -- Kickoff? + return loaded == cxq.sqsize + end + end + return sq end From 60ce041c4a1e238be8b54a10655743511e2a70c7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 27 Jan 2022 11:35:36 +0100 Subject: [PATCH 03/13] apps.mellanox: cleanup and optimization --- src/apps/mellanox/connectx.lua | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 3173c36a30..bec7d08285 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -149,7 +149,7 @@ local cxq_t = ffi.typeof([[ // Transmit state struct packet *tx[64*1024]; // packets queued for transmit - uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor + uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor uint64_t *bf_next, *bf_alt; // "blue flame" to ring doorbell (alternating) // Receive state @@ -567,8 +567,7 @@ function ConnectX:new (conf) process_fn = function(r, stats) -- Incremental update relies on query_q_counter to -- clear the counter after read. - counter.set(stats.rxdrop, - counter.read(stats.rxdrop) + r.out_of_buffer) + counter.add(stats.rxdrop, r.out_of_buffer) end }) end @@ -1385,7 +1384,7 @@ function RQ:new (cxq) function rq:receive (l) local limit = engine.pull_npackets - while have_input() and limit > 0 and not link.full(l) do + while limit > 0 and have_input() do -- Find the next completion entry. local c = cxq.rcq[cxq.next_rx_cqeid] limit = limit - 1 @@ -1394,23 +1393,23 @@ function RQ:new (cxq) cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) -- Toggle the ownership value if the CQ wraps around. if cxq.next_rx_cqeid == 0 then - cxq.rx_mine = (cxq.rx_mine + 1) % 2 + cxq.rx_mine = band(cxq.rx_mine + 1, 1) end -- Decode the completion entry. local opcode = shr(c.u8[0x3F], 4) local len = bswap(c.u32[0x2C/4]) local wqeid = shr(bswap(c.u32[0x3C/4]), 16) local idx = slot(wqeid) - if opcode == 0 or opcode == 2 then + if band(opcode, 0xfd) == 0 then -- opcode == 0 or opcode == 2 -- Successful receive local p = cxq.rx[idx] - assert(p ~= nil) + -- assert(p ~= nil) p.length = len link.transmit(l, p) cxq.rx[idx] = nil elseif opcode == 13 or opcode == 14 then -- Error on receive - assert(cxq.rx[idx] ~= nil) + -- assert(cxq.rx[idx] ~= nil) packet.free(cxq.rx[idx]) cxq.rx[idx] = nil local syndromes = { @@ -1433,10 +1432,6 @@ function RQ:new (cxq) end end - function rq:ring_doorbell () - doorbell[0].receive = bswap(next_buffer) - end - return rq end @@ -1493,7 +1488,7 @@ function SQ:new (cxq, mmio) end -- Ring the doorbell if we enqueued new packets. if cxq.next_tx_wqeid ~= start_wqeid then - local current_packet = slot(cxq.next_tx_wqeid + cxq.sqsize-1) + local current_packet = slot(cxq.next_tx_wqeid + mask) cxq.doorbell.send = bswap(cxq.next_tx_wqeid) cxq.bf_next[0] = cxq.swq[current_packet].u64[0] -- Switch next/alternate blue flame register for next time @@ -1507,11 +1502,11 @@ function SQ:new (cxq, mmio) local opcode = cxq.scq[0].u8[0x38] if opcode == 0x0A then local wqeid = shr(bswap(cxq.scq[0].u32[0x3C/4]), 16) - while next_reclaim ~= wqeid % cxq.sqsize do - assert(cxq.tx[next_reclaim] ~= nil) + while next_reclaim ~= slot(wqeid) do + -- assert(cxq.tx[next_reclaim] ~= nil) packet.free(cxq.tx[next_reclaim]) cxq.tx[next_reclaim] = nil - next_reclaim = tonumber(slot(next_reclaim + 1)) + next_reclaim = slot(next_reclaim + 1) end end end From 66ab5b1b559b070a3b2dfc097f7791a684c89a36 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 28 Jan 2022 16:58:40 +0100 Subject: [PATCH 04/13] apps.mellanox: remove dead code from packetblaster --- src/apps/mellanox/connectx.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index bec7d08285..bba64961f0 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -1529,7 +1529,6 @@ function SQ:new (cxq, mmio) -- Advance counters cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 next_slot = slot(cxq.next_tx_wqeid) - ring = true end end From 64a62f23810f2d88d55259e3ba826320017a0213 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 17 Jan 2022 09:44:58 +0100 Subject: [PATCH 05/13] apps.mellanox: Make RX/TX flow control configurable, default to off/off --- src/apps/mellanox/connectx.lua | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index bba64961f0..60542dbf9a 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -265,6 +265,9 @@ function ConnectX:new (conf) hca:set_port_mtu(mtu) hca:modify_nic_vport_context(mtu, true, true, true) + hca:set_port_flow_control(conf.fc_rx_enable == nil and false or conf.fc_rx_enable, + conf.fc_tx_enable == nil and false or conf.fc_tx_enable) + -- Create basic objects that we need -- local uar = hca:alloc_uar() @@ -1741,6 +1744,7 @@ end PMTU = 0x5003 PTYS = 0x5004 -- Port Type and Speed PAOS = 0x5006 -- Port Administrative & Operational Status +PFCC = 0x5007 -- Port Flow Control Configuration PPCNT = 0x5008 -- Ports Performance Counters PPLR = 0x5018 -- Port Physical Loopback Register @@ -1910,6 +1914,37 @@ function HCA:get_port_stats_finish () return port_stats end +function HCA:set_port_flow_control (rx_enable, tx_enable) + self:command("ACCESS_REGISTER", 0x1C, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 0) -- write + :input("register_id", 0x08, 15, 0, PFCC) + :input("local_port", 0x10, 23, 16, 1) + :input("pptx", 0x10 + 0x08, 31, 31, tx_enable and 1 or 0) + :input("pprx", 0x10 + 0x0C, 31, 31, rx_enable and 1 or 0) + :execute() +end + +local fc_status = {} +function HCA:get_port_flow_control () + self:command("ACCESS_REGISTER", 0x10, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PFCC) + :input("local_port", 0x10, 23, 16, 1) + :execute() + fc_status.pptx = self:output(0x10 + 0x08, 31, 31) + fc_status.aptx = self:output(0x10 +0x08, 30, 30) + fc_status.pfctx = self:output(0x10 + 0x08, 23, 16) + fc_status.fctx_disabled = self:output(0x10 +0x08, 8, 8) + fc_status.pprx = self:output(0x10 + 0x0c, 31, 31) + fc_status.aprx = self:output(0x10 + 0x0c, 30, 30) + fc_status.pfcrx = self:output(0x10 +0x0c, 23, 16) + fc_status.stall_minor_watermark = self:output(0x10 +0x10, 31, 16) + fc_status.stall_crit_watermark = self:output(0x10 +0x10, 15, 0) + return fc_status +end + function HCA:alloc_q_counter() self:command("ALLOC_Q_COUNTER", 0x18, 0x10C) :input("opcode", 0x00, 31, 16, 0x771) From 4c1571fcddda2ac4062d1a05377f80c792f978dd Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 17 Jan 2022 10:23:25 +0100 Subject: [PATCH 06/13] apps.mellanox: Add per-queue drop stats For each configured queue, add a stats counter rxdrop_ that reflects the hardware per-queue drop counter where is the queue id in the ConnectX configuration as defined by the user. # Conflicts: # src/apps/mellanox/connectx.lua --- src/apps/mellanox/connectx.lua | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 60542dbf9a..002b3da185 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -283,8 +283,9 @@ function ConnectX:new (conf) local rqlist = {} local rqs = {} - -- List of queue counter IDs (ConnectX5 and up) - local counter_set_ids = {} + -- List of queue counter IDs and their corresponding queue IDs from + -- the configuration (ConnectX5 and up) + local q_counters = {} -- Enable MAC/VLAN switching? local usemac = false @@ -333,7 +334,8 @@ function ConnectX:new (conf) local counter_set_id if self.mlx > 4 then counter_set_id = hca:alloc_q_counter() - table.insert(counter_set_ids, counter_set_id) + table.insert(q_counters, { counter_id = counter_set_id, + queue_id = queue.id }) end -- XXX order check cxq.sqn = hca:create_sq(scqn, pd, sq_stride, sendq_size, @@ -516,6 +518,11 @@ function ConnectX:new (conf) txdrop = {counter}, txerrors = {counter}, } + -- Create per-queue drop counters named by the queue identifiers in + -- the configuration. + for _, queue in ipairs(conf.queues) do + frame["rxdrop_"..queue.id] = {counter} + end self.stats = shm.create_frame("pci/"..pciaddress, frame) -- Create separate HCAs to retreive port statistics. Those @@ -561,16 +568,18 @@ function ConnectX:new (conf) } -- Empty for ConnectX4 - for _, id in ipairs(counter_set_ids) do + for _, q_counter in ipairs(q_counters) do + local per_q_rxdrop = self.stats["rxdrop_"..q_counter.queue_id] table.insert(self.stats_reqs, { start_fn = HCA.query_q_counter_start, finish_fn = HCA.query_q_counter_finish, - args = { set_id = id }, + args = q_counter.counter_id, process_fn = function(r, stats) -- Incremental update relies on query_q_counter to -- clear the counter after read. counter.add(stats.rxdrop, r.out_of_buffer) + counter.add(per_q_rxdrop, r.out_of_buffer) end }) end @@ -1955,13 +1964,13 @@ end local q_stats = { out_of_buffer = 0ULL } -function HCA:query_q_counter_start (args) +function HCA:query_q_counter_start (id) self:command("QUERY_Q_COUNTER", 0x20, 0x10C) :input("opcode", 0x00, 31, 16, 0x773) -- Clear the counter after reading. This allows us to -- update the rxdrop stat incrementally. :input("clear", 0x18, 31, 31, 1) - :input("counter_set_id",0x1c, 7, 0, args.set_id) + :input("counter_set_id",0x1c, 7, 0, id) :execute_async() end From 69a15b93dd29c08fbdac06094a4312b0ee92fc8d Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 21 Jan 2022 10:52:49 +0100 Subject: [PATCH 07/13] apps.mellanox: Use lib.parse in ConnectX:new() and IO:new() # Conflicts: # src/apps/mellanox/connectx.lua --- src/apps/mellanox/connectx.lua | 40 ++++++++++++++++++++++++----- src/apps/mellanox/connectx_test.lua | 6 +++-- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 002b3da185..359f153399 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -209,17 +209,38 @@ local mlx_types = { ["0x101d" ] = 6, -- ConnectX6 } +ConnectX.config = { + pciaddress = { required = true }, + sendq_size = { default = 1024 }, + recvq_size = { default = 1024 }, + mtu = { default = 9500 }, + fc_rx_enable = { default = false }, + fc_tx_enable = { default = false }, + queues = { required = true }, + macvlan = { default = false }, +} +local queue_config = { + id = { required = true }, + mac = { default = nil }, + vlan = { default = nil }, +} + function ConnectX:new (conf) local self = setmetatable({}, self) + local queues = {} + for _, queue in ipairs(conf.queues) do + table.insert(queues, lib.parse(queue, queue_config)) + end + local pciaddress = pci.qualified(conf.pciaddress) local device_info = pci.device_info(pciaddress) self.mlx = assert(mlx_types[device_info.device], "Unsupported device "..device_info.device) - local sendq_size = conf.sendq_size or 1024 - local recvq_size = conf.recvq_size or 1024 + local sendq_size = conf.sendq_size + local recvq_size = conf.recvq_size - local mtu = conf.mtu or 9500 + local mtu = conf.mtu -- Perform a hard reset of the device to bring it into a blank state. -- @@ -265,8 +286,7 @@ function ConnectX:new (conf) hca:set_port_mtu(mtu) hca:modify_nic_vport_context(mtu, true, true, true) - hca:set_port_flow_control(conf.fc_rx_enable == nil and false or conf.fc_rx_enable, - conf.fc_tx_enable == nil and false or conf.fc_tx_enable) + hca:set_port_flow_control(conf.fc_rx_enable, conf.fc_tx_enable) -- Create basic objects that we need -- @@ -1253,6 +1273,12 @@ IO.__index = IO -- lib.hardware.pci.device_info driver = IO +IO.config = { + pciaddress = {required=true}, + queue = {required=true}, + packetblaster = {default=false} +} + function IO:new (conf) local self = setmetatable({}, self) @@ -2475,8 +2501,8 @@ function selftest () io1.output = { output = link.new('output1') } -- Exercise the IO apps before the NIC is initialized. io0:pull() io0:push() io1:pull() io1:push() - local nic0 = ConnectX:new{pciaddress = pcidev0, queues = {{id='a'}}} - local nic1 = ConnectX:new{pciaddress = pcidev1, queues = {{id='b'}}} + local nic0 = ConnectX:new(lib.parse({pciaddress = pcidev0, queues = {{id='a'}}}, ConnectX.config)) + local nic1 = ConnectX:new(lib.parse({pciaddress = pcidev1, queues = {{id='b'}}}, ConnectX.config)) print("selftest: waiting for both links up") while (nic0.hca:query_vport_state().oper_state ~= 1) or diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 6eed44836c..7c4d79bafb 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -40,8 +40,10 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs end end -- Instantiate app network - local nic0 = connectx.ConnectX:new({pciaddress=pci0, queues=queues}) - local nic1 = connectx.ConnectX:new({pciaddress=pci1, queues=queues}) + local nic0 = connectx.ConnectX:new(lib.parse({pciaddress=pci0, queues=queues}, + connectx.ConnectX.config)) + local nic1 = connectx.ConnectX:new(lib.parse({pciaddress=pci1, queues=queues}, + connectx.ConnectX.config)) local io0 = {} -- io apps on nic0 local io1 = {} -- io apps on nic1 print(("creating %d queues per device..."):format(#queues)) From 3a5b5aec0b43a02a0e03f267fc8afe7f12385c18 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 18 Jan 2022 10:51:44 +0100 Subject: [PATCH 08/13] apps.mellanox: Use consumer counter in cxq to simplify ownership handling Rename cxq.next_rx_cqeid to cxq.rx_cqcc (completion queue consumer counter) and expand to 32 bit. This counter is not wrapped around the size of the cq. It is used to calculate the value of the SW ownership bit directly with efficient bitops and eliminates the need for the rx_mine member of cxq. # Conflicts: # src/apps/mellanox/connectx.lua --- src/apps/mellanox/connectx.lua | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 359f153399..959f9eaf54 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -154,9 +154,8 @@ local cxq_t = ffi.typeof([[ // Receive state struct packet *rx[64*1024]; // packets queued for receive - uint16_t next_rx_wqeid; // work queue ID for next receive descriptor - uint16_t next_rx_cqeid; // completion queue ID of next completed packet - int rx_mine; // CQE ownership value that means software-owned + uint16_t next_rx_wqeid; // work queue ID for next receive descriptor + uint32_t rx_cqcc; // consumer counter of RX CQ } ]]) @@ -1388,9 +1387,11 @@ function RQ:new (cxq) local rq = {} local mask = cxq.rqsize - 1 - -- Return the transmit queue slot for the given WQE ID. - local function slot (wqeid) - return band(wqeid, mask) + -- Return the queue slot for the given consumer counter for either + -- the CQ or the WQ. This assumes that both queues have the same + -- size. + local function slot (cc) + return band(cc, mask) end -- Refill with buffers @@ -1414,25 +1415,29 @@ function RQ:new (cxq) end end + local log2_rqsize = log2size(cxq.rqsize) + local function sw_owned () + -- The value of the ownership flag that indicates owned by SW for + -- the current consumer counter is flipped every time the counter + -- wraps around the receive queue. + return band(shr(cxq.rx_cqcc, log2_rqsize), 1) + end + local function have_input () - local c = cxq.rcq[cxq.next_rx_cqeid] + local c = cxq.rcq[slot(cxq.rx_cqcc)] local owner = bit.band(1, c.u8[0x3F]) - return owner == cxq.rx_mine + return owner == sw_owned() end function rq:receive (l) local limit = engine.pull_npackets while limit > 0 and have_input() do -- Find the next completion entry. - local c = cxq.rcq[cxq.next_rx_cqeid] + local c = cxq.rcq[slot(cxq.rx_cqcc)] limit = limit - 1 -- Advance to next completion. -- Note: assumes sqsize == cqsize - cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) - -- Toggle the ownership value if the CQ wraps around. - if cxq.next_rx_cqeid == 0 then - cxq.rx_mine = band(cxq.rx_mine + 1, 1) - end + cxq.rx_cqcc = cxq.rx_cqcc + 1 -- Decode the completion entry. local opcode = shr(c.u8[0x3F], 4) local len = bswap(c.u32[0x2C/4]) From be7a343d4d6089b0d52acb5a55063885292bb67a Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Sun, 23 Jan 2022 19:58:10 +0100 Subject: [PATCH 09/13] connectx.lua: Add basic event handling The event queue is polled at the frequency of sync_timer(). The following events are supported: * CQError. Prints the CQ number and syndrome, then aborts. * PortStateChange. Prints the port number and new state (up/down). This could be used to replace the get_port_status() call in sync_stats() * PageRequest. Allocates/deallocates the requested number of pages. --- src/apps/mellanox/connectx.lua | 144 ++++++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 27 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 959f9eaf54..87c9131828 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -647,6 +647,7 @@ function ConnectX:new (conf) function self:pull () if self.sync_timer() then self:sync_stats() + eq:poll() end end @@ -821,6 +822,17 @@ function HCA:alloc_pages (num_pages) self:execute() end +function HCA:free_pages (num_pages) + assert(num_pages > 0) + self:command("MANAGE_PAGES", 0x0C, 0x10 + num_pages*8) + :input("opcode", 0x00, 31, 16, 0x108) + :input("opmod", 0x04, 15, 0, 2) -- return pages + :input("input_num_entries", 0x0C, 31, 0, num_pages, "input_num_entries") + :execute() + local num_entries = self:output(0x08, 31, 0) + -- TODO: deallocate DMA pages +end + -- Query the NIC capabilities (maximum or current setting). function HCA:query_hca_general_cap (max_or_current) local opmod = assert(({max=0, current=1})[max_or_current]) @@ -914,67 +926,145 @@ end -- Event queues --------------------------------------------------------------- +-- Event Queue Entry (EQE) +local eqe_t = ffi.typeof([[ + struct { + uint8_t reserved1; + uint8_t event_type; + uint8_t reserved2; + uint8_t event_sub_type; + uint32_t reserved3[7]; + uint32_t event_data[7]; + uint16_t reserved4; + uint8_t signature; + uint8_t owner; + } +]]) + -- Create an event queue that can be accessed via the given UAR page number. function HCA:create_eq (uar) local numpages = 1 local log_eq_size = 7 -- 128 entries - local ptr, phy = memory.dma_alloc(4096, 4096) -- memory for entries + local byte_size = 2^log_eq_size * ffi.sizeof(eqe_t) + local ptr, phy = memory.dma_alloc(byte_size, 4096) -- memory for entries + events = bits({ + CQError = 0x04, + PortStateChange = 0x09, + PageRequest = 0x0B, + }) self:command("CREATE_EQ", 0x10C + numpages*8, 0x0C) :input("opcode", 0x00, 31, 16, 0x301) + :input("oi", 0x10 + 0x00, 17, 17, 1) -- overrun ignore :input("log_eq_size", 0x10 + 0x0C, 28, 24, log_eq_size) :input("uar_page", 0x10 + 0x0C, 23, 0, uar) :input("log_page_size", 0x10 + 0x18, 28, 24, 2) -- XXX best value? 0 or max? - :input("event bitmask", 0x10 + 0x5C, 31, 0, bits({PageRequest=0xB})) -- XXX more events? + :input("event bitmask", 0x5C, 31, 0, events) :input("pas[0] high", 0x110, 31, 0, ptrbits(phy, 63, 32)) :input("pas[0] low", 0x114, 31, 0, ptrbits(phy, 31, 0)) :execute() local eqn = self:output(0x08, 7, 0) - return eq:new(eqn, ptr, 2^log_eq_size) + return eq:new(eqn, ptr, log_eq_size, self) end --- Event Queue Entry (EQE) -local eqe_t = ffi.typeof([[ - struct { - uint16_t event_type; - uint16_t event_sub_type; - uint32_t event_data; - uint16_t pad; - uint8_t signature; - uint8_t owner; - } ]] ) - eq = {} eq.__index = eq -- Create event queue object. -function eq:new (eqn, pointer, nentries) +function eq:new (eqn, pointer, log_size, hca) + local nentries = 2^log_size local ring = ffi.cast(ffi.typeof("$*", eqe_t), pointer) - for i = 0, nentries-1 do + for i = 0, nentries - 1 do + -- Owner = HW ring[i].owner = 1 end + local mask = nentries - 1 return setmetatable({eqn = eqn, ring = ring, index = 0, - n = nentries}, + log_size = log_size, + mask = nentries - 1, + hca = hca, + }, self) end +function eq:sw_value () + return band(shr(self.index, self.log_size), 1) +end + +function eq:entry () + local slot = band(self.index, self.mask) + return self.ring[slot] +end + -- Poll the queue for events. -function eq:poll() - print("Polling EQ") - local eqe = self.ring[self.index] - while eqe.owner == 0 and eqe.event_type ~= 0xFF do +function eq:poll () + local eqe = self:entry() + while eqe.owner == self:sw_value() do + self:handle_event(eqe) self.index = self.index + 1 - eqe = self.ring[self.index % self.n] - self:event(eqe) + eqe = self:entry() end - print("done polling EQ") end -- Handle an event. -function eq:event () - print(("Got event %s.%s"):format(eqe.event_type, eqe.event_sub_type)) - error("Event handling not yet implemented") +local event_page_req = ffi.cdef([[ + struct event_page_req { + uint16_t reserved1; + uint16_t function_id; + uint32_t num_pages; + uint32_t reserved2[5]; + } +]]) +local event_port_change = ffi.cdef([[ + struct event_port_change { + uint32_t reserved1[2]; + uint8_t port_num; + uint8_t reserved2[3]; + uint32_t reserved2[4]; + } +]]) +local port_status = { + [1] = "down", + [4] = "up" +} +local event_cq_error = ffi.cdef([[ + struct event_cq_error { + uint32_t cqn; + uint32_t reserved1; + uint8_t reserved2[3]; + uint8_t syndrome; + uint32_t reserved3[4]; + } +]]) +local cq_errors = { + [1] = "overrun", + [2] = "access violation" +} +function eq:handle_event (eqe) + if eqe.event_type == 0x04 then + local cq_error = cast(typeof("struct event_cq_error *"), eqe.event_data) + local cqn = bswap(cq_error.cqn) + error(("Error on completion queue #%d: %s"):format(cqn, cq_errors[cq_error.syndrome])) + elseif eqe.event_type == 0x09 then + local port_change = cast(typeof("struct event_port_change *"), eqe.event_data) + local port = shr(port_change.port_num, 4) + print(("Port %d changed state to %s"):format(port, port_status[eqe.event_sub_type])) + elseif eqe.event_type == 0xB then + local page_req = cast(typeof("struct event_page_req *"), eqe.event_data) + local num_pages = bswap(page_req.num_pages) + if num_pages < 0 then + num_pages = -num_pages + print(("Reclaiming %d pages from HW"):format(num_pages)) + self.hca:free_pages(num_pages) + else + print(("Allocating %d pages to HW"):format(num_pages)) + self.hca:alloc_pages(num_pages) + end + else + error(("Received unexpected event type 0x%02x, subtype 0x%02x"):format(eqe.event_type, + eqe.event_sub_type)) + end end --------------------------------------------------------------- From ee2a7033819514cac146a03f54e36837e1a6760f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 17 Mar 2022 13:57:14 +0100 Subject: [PATCH 10/13] apps.connectx: hide event prints behind debug_info flag --- src/apps/mellanox/connectx.lua | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 87c9131828..14fdde2472 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -50,6 +50,7 @@ local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot local cast, typeof = ffi.cast, ffi.typeof +local debug_info = false -- Print into messages local debug_trace = false -- Print trace messages local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) @@ -810,6 +811,9 @@ end -- Provide the NIC with freshly allocated memory. function HCA:alloc_pages (num_pages) assert(num_pages > 0) + if debug_info then + print(("Allocating %d pages to HW"):format(num_pages)) + end self:command("MANAGE_PAGES", 0x14 + num_pages*8, 0x0C) :input("opcode", 0x00, 31, 16, 0x108) :input("opmod", 0x04, 15, 0, 1) -- allocate mode @@ -824,6 +828,9 @@ end function HCA:free_pages (num_pages) assert(num_pages > 0) + if debug_info then + print(("Reclaiming %d pages from HW"):format(num_pages)) + end self:command("MANAGE_PAGES", 0x0C, 0x10 + num_pages*8) :input("opcode", 0x00, 31, 16, 0x108) :input("opmod", 0x04, 15, 0, 2) -- return pages @@ -1047,18 +1054,18 @@ function eq:handle_event (eqe) local cqn = bswap(cq_error.cqn) error(("Error on completion queue #%d: %s"):format(cqn, cq_errors[cq_error.syndrome])) elseif eqe.event_type == 0x09 then - local port_change = cast(typeof("struct event_port_change *"), eqe.event_data) - local port = shr(port_change.port_num, 4) - print(("Port %d changed state to %s"):format(port, port_status[eqe.event_sub_type])) + if debug_info then + local port_change = cast(typeof("struct event_port_change *"), eqe.event_data) + local port = shr(port_change.port_num, 4) + print(("Port %d changed state to %s"):format(port, port_status[eqe.event_sub_type])) + end elseif eqe.event_type == 0xB then local page_req = cast(typeof("struct event_page_req *"), eqe.event_data) local num_pages = bswap(page_req.num_pages) if num_pages < 0 then num_pages = -num_pages - print(("Reclaiming %d pages from HW"):format(num_pages)) self.hca:free_pages(num_pages) else - print(("Allocating %d pages to HW"):format(num_pages)) self.hca:alloc_pages(num_pages) end else From 3d8ddccc07cd0164d3e365163d4fb928424f85e6 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 21 Jun 2022 15:56:15 +0000 Subject: [PATCH 11/13] lib.numa: do NUMA migrations only for >1 node systems This works around an issue in lwaftr where workers hang on NUMA migrations. Possibly, this only fixes the issue on flat NUMA systems. See 89c48fc670de1c98c611456a7642793a95b867ab --- src/lib/numa.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/numa.lua b/src/lib/numa.lua index f3ab02418a..67b554c80e 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -216,7 +216,7 @@ function bind_to_cpu (cpu, skip_perf_checks) end function unbind_numa_node () - if supports_numa() then + if has_numa() then assert(S.set_mempolicy('default')) end bound_numa_node = nil @@ -227,7 +227,7 @@ function bind_to_numa_node (node, policy) if not node then return unbind_numa_node() end assert(not bound_numa_node, "already bound") - if supports_numa() then + if has_numa() then assert(S.set_mempolicy(policy or 'preferred', node)) -- Migrate any pages that might have the wrong affinity. From cf73e1e01c401d1cfdd96454dc1be7450a0b7cd6 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 21 Jun 2022 16:03:27 +0000 Subject: [PATCH 12/13] apps.mellanox: make per-queue counter creation configurable Currenty, we create HCAs per stats request, and create indepedent stats requests for each per-queue counter set. HCAs are a limited resource, and many-queue configurations might run out of HCAs. As a workaround, allow counter set creation to be disabled per-queue. --- src/apps/mellanox/connectx.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 14fdde2472..3961fce297 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -223,6 +223,7 @@ local queue_config = { id = { required = true }, mac = { default = nil }, vlan = { default = nil }, + enable_counters = { default = true }, } function ConnectX:new (conf) @@ -352,7 +353,7 @@ function ConnectX:new (conf) -- Create the queue objects local tis = hca:create_tis(0, tdomain) local counter_set_id - if self.mlx > 4 then + if self.mlx > 4 and queue.enable_counters then counter_set_id = hca:alloc_q_counter() table.insert(q_counters, { counter_id = counter_set_id, queue_id = queue.id }) From d67f63dd9b98380b11c1d802154480438585d942 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 21 Jun 2022 16:05:21 +0000 Subject: [PATCH 13/13] lwaftr: updated ConnectX config defaults - larger send/receive queues - force flow controll off - limit per-queue counters to not exceed HCA capacity --- src/program/lwaftr/setup.lua | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 2387fbe0f3..572033408d 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -261,22 +261,30 @@ function config_connectx(c, name, opt, lwconfig) end local device = lwutil.parse_instance(lwconfig) local queues = {} + local queue_counters, queue_counters_max = 0, 24 for id, queue in pairs(lwconfig.softwire_config.instance[device].queue) do + queue_counters = queue_counters + 2 queues[#queues+1] = { id = queue_id(queue.external_interface, id), mac = ethernet:ntop(queue.external_interface.mac), - vlan = queue.external_interface.vlan_tag + vlan = queue.external_interface.vlan_tag, + enable_counters = queue_counters <= queue_counters_max } queues[#queues+1] = { id = queue_id(queue.internal_interface, id), mac = ethernet:ntop(queue.internal_interface.mac), - vlan = queue.internal_interface.vlan_tag + vlan = queue.internal_interface.vlan_tag, + enable_counters = queue_counters <= queue_counters_max } end if lwutil.is_lowest_queue(lwconfig) then config.app(c, "ConnectX_"..opt.pci:gsub("[%.:]", "_"), connectx.ConnectX, { pciaddress = opt.pci, - queues = queues + queues = queues, + sendq_size = 4096, + recvq_size = 4096, + fc_rx_enable = false, + fc_tx_enable = false }) end config.app(c, name, connectx.IO, {