Skip to content

Commit

Permalink
WIP: [MemPool-Spatz]
Browse files Browse the repository at this point in the history
1. Change gnu toolchain version for vector support.
2. Various bug fixes for Spatz IPU.
3. Various bug fixes for runtime library.
  • Loading branch information
msc23h24 Diyou Shen (dishen) committed Nov 27, 2023
1 parent 434ec0c commit fca9e9b
Show file tree
Hide file tree
Showing 10 changed files with 149 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Bender.lock
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ packages:
dependencies:
- common_cells
spatz:
revision: efec12edf249a7fc8819b00ca5452c553f446675
revision: 51c88bfa3287b0206165d5edb4a31a2c3e23ab94
version: null
source:
Git: git@iis-git.ee.ethz.ch:spatz/spatz.git
Expand Down
2 changes: 1 addition & 1 deletion Bender.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies:
reqrsp_interface: { path: "hardware/deps/reqrsp_interface" }
snitch: { path: "hardware/deps/snitch" }
tech_cells_generic: { git: "/~https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.5 }
spatz: { git: "git@iis-git.ee.ethz.ch:spatz/spatz.git", rev: efec12e }
spatz: { git: "git@iis-git.ee.ethz.ch:spatz/spatz.git", rev: 51c88bf }
FPnew: { git: "/~https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 }

workspace:
Expand Down
2 changes: 1 addition & 1 deletion config/mempool_spatz4.mk
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ axi_masters_per_group ?= 1
spatz ?= 1

# Length of a single vector register
vlen ?= 256
vlen ?= 512

# Number of IPUs
n_ipu ?= 4
Expand Down
7 changes: 5 additions & 2 deletions hardware/deps/snitch/src/snitch_md.sv
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ module snitch_md
/// Enable div/sqrt unit (buggy - use with caution)
parameter bit XDivSqrt = 0,
parameter int RegNrWritePorts = 2, // Implement one or two write ports into the register file

parameter type acc_issue_rsp_t = logic,
// Dependant parameters.
localparam bit FP_EN = RVF || RVD // Enable FP in general
Expand Down Expand Up @@ -2247,6 +2246,7 @@ module snitch_md
acc_qvalid_o = valid_instr;
ls_size = Word;
is_fp_store = 1'b1;
acc_mem_store = 1'b1;
end else begin
illegal_inst = 1'b1;
end
Expand All @@ -2272,6 +2272,7 @@ module snitch_md
acc_qvalid_o = valid_instr;
ls_size = Double;
is_fp_store = 1'b1;
acc_mem_store = 1'b1;
end else begin
illegal_inst = 1'b1;
end
Expand All @@ -2297,6 +2298,7 @@ module snitch_md
acc_qvalid_o = valid_instr;
ls_size = HalfWord;
is_fp_store = 1'b1;
acc_mem_store = 1'b1;
end else begin
illegal_inst = 1'b1;
end
Expand All @@ -2322,6 +2324,7 @@ module snitch_md
acc_qvalid_o = valid_instr;
ls_size = Byte;
is_fp_store = 1'b1;
acc_mem_store = 1'b1;
end else begin
illegal_inst = 1'b1;
end
Expand Down Expand Up @@ -2688,7 +2691,7 @@ module snitch_md
// address can be alu_result (i.e. rs1 + iimm/simm) or rs1 (for post-increment load/stores)
assign lsu_qaddr = is_postincr ? gpr_rdata[0] : alu_result;

assign lsu_qvalid = valid_instr & (is_load | is_store) & ~(ld_addr_misaligned | st_addr_misaligned);
assign lsu_qvalid = valid_instr & (is_load | is_store) & ~(ld_addr_misaligned | st_addr_misaligned) & ~acc_mem_stall;

// NOTE(smazzola): write-backs "on rd from non-load or non-acc instructions" and "on rs1 from
// post-increment instructions" in the same cycle should be mutually exclusive (currently valid
Expand Down
4 changes: 2 additions & 2 deletions hardware/src/mempool_tile.sv
Original file line number Diff line number Diff line change
Expand Up @@ -804,8 +804,8 @@ module mempool_tile
remote_req_interco[idx].wdata.data}),
.tcdm_req_amo_o({local_req_interco_payload[idx].wdata.amo,
remote_req_interco[idx].wdata.amo}),
.tcdm_req_id_o({local_req_interco_payload[idx]
.wdata.meta_id, remote_req_interco[idx].wdata.meta_id}),
.tcdm_req_id_o({local_req_interco_payload[idx].wdata.meta_id,
remote_req_interco[idx].wdata.meta_id}),
.tcdm_req_be_o ({local_req_interco_payload[idx].be, remote_req_interco[idx].be}),
.tcdm_req_ready_i ({local_req_interco_ready[idx], remote_req_interco_ready[idx]} ),
.tcdm_resp_valid_i({local_resp_interco_valid[idx], remote_resp_interco_valid[idx]}),
Expand Down
13 changes: 13 additions & 0 deletions software/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ SHELL = /usr/bin/env bash
# Directory containing this Makefile, with any trailing slash removed
ROOT_DIR := $(patsubst %/,%, $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
# Top level of the enclosing git checkout; if git fails, fall back to the
# MEMPOOL_DIR value found in the shell's environment
MEMPOOL_DIR := $(shell git rev-parse --show-toplevel 2>/dev/null || echo $$MEMPOOL_DIR)
# Sub-directories of ROOT_DIR
APPS_DIR := $(ROOT_DIR)/apps
BIN_DIR := $(ROOT_DIR)/bin
HALIDE_DIR := $(ROOT_DIR)/halide
RUNTIME_DIR := $(ROOT_DIR)/runtime
TESTS_DIR := $(ROOT_DIR)/riscv-tests/isa
# Toolchain directory, located next to ROOT_DIR
TOOLCHAIN_DIR := $(abspath $(ROOT_DIR)/../toolchain)
# This will overwrite the ROOT_DIR variable from the included makefile
include $(RUNTIME_DIR)/runtime.mk
include $(TESTS_DIR)/snitch_isa.mk
include $(TESTS_DIR)/spatz.mk

# Applications
.PHONY: apps
Expand Down Expand Up @@ -50,14 +52,25 @@ $(eval $(call rtl_mempool_tests_template,rv32ui))
$(eval $(call rtl_mempool_tests_template,rv32um))
$(eval $(call rtl_mempool_tests_template,rv32ua))
$(eval $(call rtl_mempool_tests_template,rv32uxpulpimg))
$(eval $(call rtl_mempool_tests_template,rv32uf))

# Output paths of the Spatz vector tests: every name in rv32uv_spatz_tests,
# placed under BIN_DIR.
TESTS_rv32uv := $(addprefix $(BIN_DIR)/,$(rv32uv_spatz_tests))

# Compile and link one vector test from its C source in TESTS_DIR/rv32uv
# together with the runtime, then write a disassembly dump next to the binary.
$(BIN_DIR)/rv32uv-spatz-%: $(TESTS_DIR)/rv32uv/%.c $(RUNTIME) $(LINKER_SCRIPT)
	mkdir -p $(BIN_DIR)
	$(RISCV_CC) \
		-Iinclude -I$(TESTS_DIR)/macros/scalar -I$(TESTS_DIR)/macros/vector \
		$(RISCV_LDFLAGS_TESTS) \
		-o $@ $< $(RUNTIME) -T$(RUNTIME_DIR)/link.ld
	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump

# Aggregate test targets. Neither name is a file, so both are declared phony;
# otherwise a stray file called `test` or `test_spatz` would mask them.
.PHONY: test test_spatz
# Depends on update_opcodes and on every binary listed in TESTS.
test: update_opcodes $(TESTS)
# Depends on update_opcodes and on every binary listed in TESTS_rv32uv.
test_spatz: update_opcodes $(TESTS_rv32uv)

# Remove the runtime, the linker script and every test binary together with
# its disassembly dump. Declared phony so that a file named `clean-test`
# cannot make the target look up to date.
.PHONY: clean-test
clean-test:
	rm -vf $(RUNTIME)
	rm -vf $(LINKER_SCRIPT)
	rm -vf $(TESTS)
	rm -vf $(addsuffix .dump,$(TESTS))
	rm -vf $(TESTS_rv32uv)
	rm -vf $(addsuffix .dump,$(TESTS_rv32uv))

# Helper targets
update_opcodes:
Expand Down
20 changes: 19 additions & 1 deletion software/apps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,24 @@ SOFTWARE_DIR := $(abspath $(ROOT_DIR)/..)
RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime)
BIN_DIR := $(abspath $(SOFTWARE_DIR)/bin)
APPS_DIR := $(ROOT_DIR)
TESTS_DIR := $(abspath $(SOFTWARE_DIR)/riscv-tests/isa)
# This will overwrite the ROOT_DIR variable from the included makefile
include $(RUNTIME_DIR)/runtime.mk


# Header names derived from every data.args file below APPS_DIR and from every
# Python script in RUNTIME_DIR/data.
DATA := $(patsubst %.args,%.h,$(shell find $(APPS_DIR) -name "data.args"))
ALLPYS := $(patsubst %.py,%.h,$(wildcard $(RUNTIME_DIR)/data/*.py))

# Application list: every directory below APPS_DIR (following symlinks) that
# contains a main.c, named relative to APPS_DIR. APPS is assigned in both
# branches below, so no unconditional assignment (and no extra `find` run at
# parse time) is needed before the conditional.
ifeq ($(spatz), 1)
  APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find -L $(APPS_DIR) -name "main.c"))
  # Remove MemPool-specific Spatz kernels
  ifeq ($(rvd), 1)
    APPS := $(filter-out spatz_apps/mempool%,$(APPS))
  endif
else
  # Without Spatz, leave out everything under spatz_apps/
  APPS := $(patsubst $(APPS_DIR)/%/main.c,%,$(shell find -L $(APPS_DIR) -name "main.c" -not -path "*/spatz_apps/*"))
endif
BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
ifeq ($(config), systolic)
ALL := $(APPS)
Expand All @@ -37,6 +49,7 @@ $(APPS): % : $(BIN_DIR)/% $(APPS_DIR)/Makefile $(shell find $(RUNTIME_DIR)/**.{S

# Link each application binary from its main.c.o and the runtime, then write a
# disassembly dump next to it. The binaries are declared phony, so they are
# relinked on every invocation.
# The leftover debug `echo $(APPS)` was dropped: it printed the complete
# application list once per binary.
.PHONY: $(BINARIES)
$(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLPYS) update_opcodes
	mkdir -p $(dir $@)
	$(RISCV_CC) -Iinclude $(RISCV_LDFLAGS) -o $@ $< $(RUNTIME) -T$(RUNTIME_DIR)/link.ld
	$(RISCV_OBJDUMP) $(RISCV_OBJDUMP_FLAGS) -D $@ > $@.dump
Expand All @@ -45,12 +58,17 @@ $(BINARIES): $(BIN_DIR)/%: %/main.c.o $(RUNTIME) $(LINKER_SCRIPT) $(DATA) $(ALLP
# Delegate to the `update_opcodes` target of the Makefile in MEMPOOL_DIR.
# $(MAKE) is used instead of a literal `make` so the sub-make runs with the
# same make program and cooperates with -n and the -j jobserver.
.PHONY: update_opcodes
update_opcodes:
	$(MAKE) -C $(MEMPOOL_DIR) update_opcodes

.PHONY: clean
.PHONY: clean clean_bin
# Delete the application binaries and the files listed in SPATZ_BINARIES, the
# matching .dump files, each application's main.c.o, the runtime and the
# linker script.
clean:
	rm -vf $(BINARIES)
	rm -vf $(SPATZ_BINARIES)
	rm -vf $(patsubst %,%.dump,$(BINARIES))
	rm -vf $(patsubst %,%.dump,$(SPATZ_BINARIES))
	rm -vf $(patsubst %,%/main.c.o,$(APPS))
	rm -vf $(RUNTIME)
	rm -vf $(LINKER_SCRIPT)

# Delete everything inside BIN_DIR. The first recipe line aborts with an error
# when BIN_DIR is empty, because the glob below would then expand to `/*`.
clean_bin:
	$(if $(strip $(BIN_DIR)),,$(error BIN_DIR is empty; refusing to run rm -rf))
	rm -vrf $(BIN_DIR)/*

.INTERMEDIATE: $(addsuffix /main.c.o,$(APPS))
72 changes: 52 additions & 20 deletions software/runtime/crt0.S
Original file line number Diff line number Diff line change
Expand Up @@ -53,32 +53,64 @@ _reset_vector:
li x29, 0
li x30, 0
li x31, 0
la sp, __stack_start // load stack
csrr a0, mhartid // get hart id
#if N_FPU != 0
    // Write zero (from the `zero` integer register) into each of the 32
    // floating-point registers f0..f31, once per register. This region is
    // only assembled when N_FPU is non-zero.
    fmv.s.x f0, zero
    fmv.s.x f1, zero
    fmv.s.x f2, zero
    fmv.s.x f3, zero
    fmv.s.x f4, zero
    fmv.s.x f5, zero
    fmv.s.x f6, zero
    fmv.s.x f7, zero
    fmv.s.x f8, zero
    fmv.s.x f9, zero
    fmv.s.x f10, zero
    fmv.s.x f11, zero
    fmv.s.x f12, zero
    fmv.s.x f13, zero
    fmv.s.x f14, zero
    fmv.s.x f15, zero
    fmv.s.x f16, zero
    fmv.s.x f17, zero
    fmv.s.x f18, zero
    fmv.s.x f19, zero
    fmv.s.x f20, zero
    fmv.s.x f21, zero
    fmv.s.x f22, zero
    fmv.s.x f23, zero
    fmv.s.x f24, zero
    fmv.s.x f25, zero
    fmv.s.x f26, zero
    fmv.s.x f27, zero
    fmv.s.x f28, zero
    fmv.s.x f29, zero
    fmv.s.x f30, zero
    fmv.s.x f31, zero
#endif
la sp, __stack_start // load stack
csrr a0, mhartid // get hart id
// Calculate sequential region offset for our tile
srli t0, a0, LOG2_NUM_CORES_PER_TILE // tile_id = id / NUM_CORES_PER_TILE
slli t0, t0, (LOG2_NUM_CORES_PER_TILE+LOG2_SEQ_MEM_SIZE) // tile_offset = tile_id * NUM_CORES_PER_TILE * SEQ_MEM_SIZE
srli t0, a0, LOG2_NUM_CORES_PER_TILE+LOG2_N_FU // tile_id = id / (NUM_CORES_PER_TILE * N_FU)
slli t0, t0, (LOG2_NUM_CORES_PER_TILE+LOG2_N_FU+LOG2_SEQ_MEM_SIZE) // tile_offset = tile_id * NUM_CORES_PER_TILE * N_FU * SEQ_MEM_SIZE
// Calculate stack offset within tile
li t1, NUM_CORES_PER_TILE // NUM_CORES_PER_TILE
addi t1, t1, -1 // Create mask for NUM_CORES_PER_TILE
and t1, a0, t1 // tile_core_id = id % 4
addi t1, t1, 1 // tile_core_id += 1
slli t1, t1, LOG2_STACK_SIZE // tile_core_offset = tile_core_id * STACK_SIZE
addi t1, t1, -4 // tile_core_offset -= 1 word
li t1, NUM_CORES_PER_TILE * N_FU // NUM_CORES_PER_TILE * N_FU
addi t1, t1, -1 // Create mask for NUM_CORES_PER_TILE * N_FU (assumes the product is a power of two)
and t1, a0, t1 // tile_core_id = id % (NUM_CORES_PER_TILE * N_FU)
addi t1, t1, 1 // tile_core_id += 1
slli t1, t1, LOG2_STACK_SIZE // tile_core_offset = tile_core_id * STACK_SIZE
addi t1, t1, -4 // tile_core_offset -= 1 word
// Calculate final stack pointer
add t0, t0, t1 // offset = tile_offset + tile_core_offset
add sp, sp, t0 // sp += offset
add t0, t0, t1 // offset = tile_offset + tile_core_offset
add sp, sp, t0 // sp += offset
// Add offset in case we have hardware queues at the beginning of sequential memory
li t0, XQUEUE_SIZE // XQUEUE_SIZE (in words)
slli t0, t0, (4+2) // XQUEUE_SIZE * BANKS_PER_TILE * BYTES_PER_WORD
add sp, sp, t0 // offset += 16 * xqueue_size * 4
// Write the stack limit into the dedicated CSR
addi t0, sp, -(STACK_SIZE-4) // stack_limit = sp - (STACK_SIZE - 1)
csrw stacklimit, t0 // write stack limit into CSR
li t0, XQUEUE_SIZE // XQUEUE_SIZE (in words)
slli t0, t0, (4+2) // XQUEUE_SIZE * BANKS_PER_TILE * BYTES_PER_WORD
add sp, sp, t0 // offset += 16 * xqueue_size * 4
// Configure the RO cache or directly jump to main
bnez a0, _jump_main
la t0, ro_cache_end_0 // Get peripheral register to set cacheable region
la t1, _erodata // Write the end of the read-only data to be cacheable
la t0, ro_cache_end_0 // Get peripheral register to set cacheable region
la t1, _erodata // Write the end of the read-only data to be cacheable
sw t1, 0(t0)
_jump_main:
call main
Expand Down
Loading

0 comments on commit fca9e9b

Please sign in to comment.