Skip to content

Commit

Permalink
Merge/stream synchronization (#349)
Browse files Browse the repository at this point in the history
* Updated project_generation_scripts and tf_merge_streamer.vhd to synchronize bx with output, synchronize bx 0 with other bxs, and synchronize reading of memories by tf_merge_streamer

* Remove commented out code and fix tabs

* Added optimizations to help meet timing: changed tf_merge_streamer and transitioned TPAR memories to URAM

* Incorporate suggestions from Jason (change if to reverse loop)

* Merged project_generation_scripts PR
  • Loading branch information
mcoshiro authored Oct 28, 2024
1 parent c139d6d commit 7f01c3c
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 140 deletions.
5 changes: 3 additions & 2 deletions IntegrationTests/common/hdl/tf_mem.vhd
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ entity tf_mem is
INIT_HEX : boolean := true; --! Read init file in hex (default) or bin
RAM_PERFORMANCE : string := "HIGH_PERFORMANCE";--! Select "HIGH_PERFORMANCE" (2 clk latency) or "LOW_LATENCY" (1 clk latency)
NAME : string := "MEMNAME"; --! Name of mem for printout
DEBUG : boolean := false --! If true prints debug info
DEBUG : boolean := false; --! If true prints debug info
MEM_TYPE : string := "block" --! specifies RAM type (block/ultra)
);
port (
clka : in std_logic; --! Write clock
Expand Down Expand Up @@ -101,7 +102,7 @@ signal sv_RAM_row : std_logic_vector(RAM_WIDTH-1 downto 0) := (others =>'0');

-- ########################### Attributes ###########################
attribute ram_style : string;
attribute ram_style of sa_RAM_data : signal is "block";
attribute ram_style of sa_RAM_data : signal is MEM_TYPE;

begin

Expand Down
324 changes: 187 additions & 137 deletions IntegrationTests/common/hdl/tf_merge_streamer.vhd
Original file line number Diff line number Diff line change
@@ -1,137 +1,187 @@
--===========================================================================
--! @file
--! @brief Module which reads and streams out the contents of the memories
--! at the end of the first half of the TF algo.
--! @author Jason Fan ([email protected])
--! @date 2024-02-29
--! @version v.1.0
--===========================================================================

--! Standard library
library ieee;
--! Standard package
use ieee.std_logic_1164.all;
--! Signed/unsigned calculations
use ieee.numeric_std.all;
--! Standard functions
library std;
--! Standard TextIO functions
use std.textio.all;

--! Xilinx library
library unisim;
--! Xilinx package
use unisim.vcomponents.all;
use work.tf_pkg.all;

--! Interface of the merge streamer: reads up to four paged memories (din0..din3,
--! with per-page entry counts nent0..nent3) and emits their words one at a time
--! on merged_dout, tagged with the index of the input each word came from.
entity tf_merge_streamer is
generic (
--! Width in bits of each dinN data word
RAM_WIDTH : natural := 72;
--! Number of pages; each nentN port carries one entry count per page
NUM_PAGES : natural := 8;
--! Depth used to size the read addresses (CLOGB2(RAM_DEPTH) bits per input);
--! PAGE_LENGTH is not declared in this file -- presumably from work.tf_pkg, TODO confirm
RAM_DEPTH : natural := NUM_PAGES * PAGE_LENGTH;
--! Number of inputs actually served; sizes enb_arr and addr_arr
NUM_INPUTS : natural := 4;
--! Number of bits in merged_dout used to encode the source input index
NUM_EXTRA_BITS: natural := 2;
--! NOTE(review): not referenced by the ports or by the RTL architecture in this file
ADDR_WIDTH : natural := 7
);
port (
--bunch-crossing number; the architecture uses it (mod NUM_PAGES) as the page to read
bx_in : in std_logic_vector(2 downto 0 );
--NOTE(review): rst is declared but never read by the RTL architecture in this file
rst: in std_logic;
clk : in std_logic;
--output read enable to tf_mem modules
--NOTE(review): the RTL architecture in this file never assigns enb_arr
enb_arr: out std_logic_vector(NUM_INPUTS-1 downto 0);
--registered copy of bx_in
bx_out : out std_logic_vector(2 downto 0);
--output merged stream, includes input word, up to 2 bits that encode the
--original module, and a valid bit (from LSB to MSB)
merged_dout : out std_logic_vector(RAM_WIDTH+NUM_EXTRA_BITS downto 0);
--input data,nent and addresses are best suited for unconstrained arrays
--but this is not supported in vivado 2019
--module always accepts 4 input memories, but will not use all of them
din0: in std_logic_vector(RAM_WIDTH-1 downto 0);
din1: in std_logic_vector(RAM_WIDTH-1 downto 0);
din2: in std_logic_vector(RAM_WIDTH-1 downto 0);
din3: in std_logic_vector(RAM_WIDTH-1 downto 0);
--per-page entry counts for each input; an input is read while its read counter
--is below the count for the current page
nent0: in t_arr_7b(0 to NUM_PAGES-1);
nent1: in t_arr_7b(0 to NUM_PAGES-1);
nent2: in t_arr_7b(0 to NUM_PAGES-1);
nent3: in t_arr_7b(0 to NUM_PAGES-1);
--concatenated read addresses: input i owns the slice
--(i+1)*CLOGB2(RAM_DEPTH)-1 downto i*CLOGB2(RAM_DEPTH)
addr_arr: out std_logic_vector(NUM_INPUTS*CLOGB2(RAM_DEPTH)-1 downto 0)
) ;
end entity tf_merge_streamer;

--! Single clocked process that, on every rising edge of clk:
--!  1. recomputes which inputs still have unread entries on the current page,
--!  2. picks one of them round-robin and issues its next read address,
--!  3. forwards the selected input word to merged_dout when the last bit of
--!     the valid shift register is set.
--! NOTE(review): rst is never read and enb_arr is never assigned in this architecture.
architecture RTL of tf_merge_streamer is

-- number of din/nent port pairs on the entity
constant MAX_INPUTS : integer := 4;
-- length of the valid and toread shift registers
constant pipe_stages : integer := 4;

-- per-input count of words already requested on the current page
type mem_count_arr is array(NUM_INPUTS-1 downto 0) of integer;
-- per-stage index of the input that was selected
type toread_arr is array(pipe_stages-1 downto 0) of integer;

--nent and din are repackaged from the individual ports into
--arrays (index i holds nent<i> / din<i>)
type nent_array is array(MAX_INPUTS-1 downto 0) of t_arr_7b(0 to NUM_PAGES-1);
type din_array is array(MAX_INPUTS-1 downto 0) of std_logic_vector(RAM_WIDTH-1 downto 0);

-- valid(0) is set when a read is issued; it is shifted one stage per clock
signal valid : std_logic_vector(pipe_stages-1 downto 0) := (others => '0');
-- bit i is '1' while input i still has entries to read; being a signal, the
-- value tested in a given clock is the one computed on the previous clock
signal readmask : std_logic_vector(NUM_INPUTS-1 downto 0) := (others => '0');

begin
process(clk)
variable nent_arr: nent_array;
variable din_arr: din_array;
-- bx value seen on the previous clock
variable bx_last :integer :=0;
variable mem_count : mem_count_arr := (others => 0);
variable current_page: natural := 0;
variable bx_change : boolean := false; -- indicates to the module whether or not the bx has changed compared to the previous clock

variable toread : toread_arr := (others => 0);

begin
if rising_edge(clk) then
nent_arr := (nent3,nent2,nent1,nent0); --repackage nent and din as arrays
din_arr := (din3, din2, din1, din0);
bx_change := (bx_last /= to_integer(unsigned(bx_in)));
if (bx_change) then --restart the read counters on a change in bx (rst is not consulted)
-- all inputs start again from the first entry of the new page
mem_count := (others => 0);
end if ;
current_page := to_integer(unsigned(bx_in)) mod NUM_PAGES;
--check if memory read counter is less than nentries
for i in 0 to NUM_INPUTS-1 loop
if ((mem_count(i)) < to_integer(unsigned(nent_arr(i)(current_page)))) then
readmask(i) <= '1';
else
readmask(i) <= '0';
end if;
end loop;
-- readmask read below is the value from the previous clock (signal semantics)
if (to_integer(unsigned(readmask)) = 0) then
valid(0) <= '0';
else
-- round-robin: start with the input after the one read last and take the
-- first one whose readmask bit is set
for j in 0 to NUM_INPUTS-1 loop
if readmask((j + toread(0) + 1) mod NUM_INPUTS) = '1' then
toread(0) := (j + toread(0) + 1 ) mod NUM_INPUTS;
exit;
end if;
end loop;
-- write the address (page base + words already read) into the selected input's slice
addr_arr(((toread(0)+1)*clogb2(RAM_DEPTH))-1 downto (toread(0))*clogb2(RAM_DEPTH)) <= std_logic_vector(to_unsigned(current_page*page_length + mem_count(toread(0)), clogb2(RAM_DEPTH)));
valid(0) <= '1';
mem_count(toread(0)) := mem_count(toread(0)) + 1;
end if;

-- output stage: MSB is the valid bit, then the source index, then the data word
if valid(pipe_stages-1) ='1' then
if (NUM_EXTRA_BITS > 0) then
merged_dout <= '1' & std_logic_vector(to_unsigned(toread(pipe_stages-1),NUM_EXTRA_BITS)) & din_arr(toread(pipe_stages-1));
else
-- NOTE(review): this branch indexes toread(pipe_stages-2) while the branch
-- above uses toread(pipe_stages-1) -- confirm which stage is intended
merged_dout <= '1' & din_arr(toread(pipe_stages-2));
end if ;
else
merged_dout <= (others => '0');
end if;
bx_last := to_integer(unsigned(bx_in));
bx_out <= bx_in;
-- NOTE(review): toread is a variable and this loop ascends, so toread(0) is
-- copied into every later stage within this one clock; valid is a signal and
-- advances one stage per clock. Confirm the two are meant to stay aligned.
for j in 0 to pipe_stages-2 loop
toread(j+1) := toread(j);
valid(j+1) <= valid(j);
end loop;
end if;
end process;
end RTL;
--===========================================================================
--! @file
--! @brief Module which reads and streams out the contents of the memories
--! at the end of the first half of the TF algo.
--! @author Jason Fan ([email protected])
--! @date 2024-02-29
--! @version v.1.1
--===========================================================================

--! Standard library
library ieee;
--! Standard package
use ieee.std_logic_1164.all;
--! Signed/unsigned calculations
use ieee.numeric_std.all;
--! Standard functions
library std;
--! Standard TextIO functions
use std.textio.all;

--! Xilinx library
library unisim;
--! Xilinx package
use unisim.vcomponents.all;
use work.tf_pkg.all;

--! Interface of the merge streamer: reads up to four paged memories (din0..din3,
--! with per-page entry counts nent0..nent3) and emits their words one at a time
--! on merged_dout, tagged with the index of the input each word came from.
entity tf_merge_streamer is
generic (
--! Width in bits of each dinN data word
RAM_WIDTH: natural := 72;
--! Number of pages; each nentN port carries one entry count per page
NUM_PAGES : natural := 8;
--! Depth used to size the read addresses (CLOGB2(RAM_DEPTH) bits per input);
--! PAGE_LENGTH is not declared in this file -- presumably from work.tf_pkg, TODO confirm
RAM_DEPTH : natural := NUM_PAGES * PAGE_LENGTH;
--! Number of inputs actually served; sizes enb_arr and addr_arr
NUM_INPUTS : natural := 4;
--! Number of bits in merged_dout used to encode the source input index
NUM_EXTRA_BITS: natural := 2;
--! NOTE(review): not referenced by the ports or by the RTL architecture in this file
ADDR_WIDTH : natural := 7
);
port (
--bunch-crossing number; only sampled on clocks where bx_in_vld is '1'
bx_in : in std_logic_vector(2 downto 0 );
--qualifier for bx_in: when '1' the architecture latches bx_in and updates the page to read
bx_in_vld : in std_logic;
--NOTE(review): rst is declared but never read by the RTL architecture in this file
rst: in std_logic;
clk : in std_logic;
--output read enable to tf_mem modules
--NOTE(review): the RTL architecture in this file never assigns enb_arr
enb_arr: out std_logic_vector(NUM_INPUTS-1 downto 0);
--latched bx after it has passed through the architecture's bx delay registers
bx_out : out std_logic_vector(2 downto 0);
--output merged stream, includes input word, up to 2 bits that encode the
--original module, and a valid bit (from LSB to MSB)
merged_dout : out std_logic_vector(RAM_WIDTH+NUM_EXTRA_BITS downto 0);
--input data,nent and addresses are best suited for unconstrained arrays
--but this is not supported in vivado 2019
--module always accepts 4 input memories, but will not use all of them
din0: in std_logic_vector(RAM_WIDTH-1 downto 0);
din1: in std_logic_vector(RAM_WIDTH-1 downto 0);
din2: in std_logic_vector(RAM_WIDTH-1 downto 0);
din3: in std_logic_vector(RAM_WIDTH-1 downto 0);
--per-page entry counts for each input; an input is read while its read counter
--is below the count for the current page
nent0: in t_arr_7b(0 to NUM_PAGES-1);
nent1: in t_arr_7b(0 to NUM_PAGES-1);
nent2: in t_arr_7b(0 to NUM_PAGES-1);
nent3: in t_arr_7b(0 to NUM_PAGES-1);
--concatenated read addresses: input i owns the slice
--(i+1)*CLOGB2(RAM_DEPTH)-1 downto i*CLOGB2(RAM_DEPTH)
addr_arr: out std_logic_vector(NUM_INPUTS*CLOGB2(RAM_DEPTH)-1 downto 0)
) ;
end entity tf_merge_streamer;

--! Single clocked process plus an address fan-out generate.
--! On every rising edge of clk the process:
--!  1. latches bx_in (and the page derived from it) when bx_in_vld is '1',
--!  2. on the clock after the latched bx changed, clears the read counters and
--!     seeds readmask from the entry counts of the new page,
--!  3. on every other clock, picks the next input round-robin among those whose
--!     readmask bit is set, issues its read address and bumps its counter,
--!  4. drives merged_dout from the input selected pipe_stages clocks earlier,
--!  5. advances the valid / toread / bx delay registers.
--! NOTE(review): rst is never read and enb_arr is never assigned in this architecture.
architecture RTL of tf_merge_streamer is

  -- number of din/nent port pairs on the entity
  constant MAX_INPUTS     : integer := 4;
  -- length of the valid and toread shift registers
  constant pipe_stages    : integer := 3;
  -- address bits per input
  constant LOG2_RAM_DEPTH : integer := CLOGB2(RAM_DEPTH);

  -- per-input count of words already requested on the current page
  type mem_count_arr is array(MAX_INPUTS-1 downto 0) of integer;
  -- per-stage index of the selected input (bounded by the number of input ports)
  type toread_arr is array(pipe_stages-1 downto 0) of integer range 0 to MAX_INPUTS-1;
  -- one more stage than valid/toread; the last stage feeds bx_out
  type bx_arr is array(pipe_stages downto 0) of std_logic_vector(2 downto 0);
  type addr_arr_arr is array(MAX_INPUTS-1 downto 0) of std_logic_vector(LOG2_RAM_DEPTH-1 downto 0);

  --nent and din are repackaged from the individual ports into
  --arrays (index i holds nent<i> / din<i>)
  type nent_array is array(MAX_INPUTS-1 downto 0) of t_arr_7b(0 to NUM_PAGES-1);
  type din_array is array(MAX_INPUTS-1 downto 0) of std_logic_vector(RAM_WIDTH-1 downto 0);

  signal valid        : std_logic_vector(pipe_stages-1 downto 0) := (others => '0');
  signal bx_pipe      : bx_arr := (others => (others => '0'));
  signal addr_arr_int : addr_arr_arr := (others => (others => '0'));
  signal bx_last      : std_logic_vector(2 downto 0) := "111";
  signal bx_in_latch  : std_logic_vector(2 downto 0) := "111"; --since output triggered by BX change, initializing bx_in_latch to 7 will start write on first valid bx (0)
  signal mem_count    : mem_count_arr := (others => 0);
  signal toread       : toread_arr := (others => 0);
  signal current_page : natural := 7 mod NUM_PAGES;
  -- bit i is '1' while input i still has entries to read; bits at or above
  -- NUM_INPUTS are never written and keep their initial '0'
  signal readmask     : std_logic_vector(MAX_INPUTS-1 downto 0) := (others => '0');

begin

  process(clk)
    variable nent_arr       : nent_array;
    variable din_arr        : din_array;
    variable bx_change      : boolean := false; -- indicates to the module whether or not the bx has changed compared to the previous clock
    variable nextread       : integer range 0 to MAX_INPUTS-1 := 0;
    variable mem_count_next : mem_count_arr := (others => 0);
  begin
    if rising_edge(clk) then
      -- only accept a new bx (and page) when the upstream flags it as valid
      if (bx_in_vld = '1') then
        bx_in_latch  <= bx_in;
        current_page <= to_integer(unsigned(bx_in)) mod NUM_PAGES;
      end if;

      nent_arr  := (nent3, nent2, nent1, nent0); --repackage nent and din as arrays
      din_arr   := (din3, din2, din1, din0);
      bx_change := (bx_last /= bx_in_latch);

      if (bx_change) then
        --restart on a change of the latched bx (the rst port is not consulted)
        mem_count <= (others => 0);
        -- park the round-robin pointer on the last input so that the
        -- highest-priority candidate on the next clock is input 0
        toread(0) <= (NUM_INPUTS-1) mod NUM_INPUTS;
        valid(0)  <= '0';

        --check if memory read counter is less than nentries
        --this sets readmask to 1 for any inputs that have words to read
        --(counters are being cleared, so compare against zero)
        for i in 0 to NUM_INPUTS-1 loop
          if (0 < to_integer(unsigned(nent_arr(i)(current_page)))) then
            readmask(i) <= '1';
          else
            readmask(i) <= '0';
          end if;
        end loop;

      else
        --only check for valid reads on non BX change clocks
        --this gives up a clock cycle, but reduces logic levels downstream

        -- start from the registered counters; the selected input is bumped below
        for i in 0 to NUM_INPUTS-1 loop
          mem_count_next(i) := mem_count(i);
        end loop;

        if (to_integer(unsigned(readmask)) = 0) then
          valid(0) <= '0';
        else
          valid(0) <= '1';
          -- Default keeps nextread purely combinational. The loop below visits
          -- every index, so with readmask /= 0 it always overrides this value.
          nextread := toread(0);
          --round-robin: there is no exit, so the last match wins; the last
          --iteration tests toread(0)+1, which therefore has highest priority,
          --and toread(0) itself (first iteration) has lowest
          for i in 0 to MAX_INPUTS-1 loop
            if (readmask((toread(0) - i) mod MAX_INPUTS) = '1') then
              nextread := (toread(0) - i) mod MAX_INPUTS;
            end if;
          end loop;
          -- address = page base + words already requested from this input
          addr_arr_int(nextread) <= std_logic_vector(to_unsigned(current_page*page_length + mem_count(nextread), LOG2_RAM_DEPTH));
          mem_count(nextread)      <= mem_count(nextread) + 1;
          toread(0)                <= nextread;
          mem_count_next(nextread) := mem_count_next(nextread) + 1;
        end if;

        --check if memory read counter is less than nentries
        --this sets readmask to 1 for any inputs that still have words to read
        --(uses the counters as they will be after this clock)
        for i in 0 to NUM_INPUTS-1 loop
          if ((mem_count_next(i)) < to_integer(unsigned(nent_arr(i)(current_page)))) then
            readmask(i) <= '1';
          else
            readmask(i) <= '0';
          end if;
        end loop;

      end if;

      --generate output a few clocks after address is set to account for delay in RAMs
      --layout, MSB first: valid bit, source input index, data word
      if valid(pipe_stages-1) = '1' then
        if (NUM_EXTRA_BITS > 0) then
          merged_dout <= '1' & std_logic_vector(to_unsigned(toread(pipe_stages-1), NUM_EXTRA_BITS)) & din_arr(toread(pipe_stages-1));
        else
          merged_dout <= '1' & din_arr(toread(pipe_stages-1));
        end if;
      else
        merged_dout <= (others => '0');
      end if;

      -- advance the delay registers (all signals, so each moves one stage per clock)
      bx_last    <= bx_in_latch;
      bx_pipe(0) <= bx_in_latch;
      bx_out     <= bx_pipe(pipe_stages);
      for j in pipe_stages-2 downto 0 loop
        valid(j+1)  <= valid(j);
        toread(j+1) <= toread(j);
      end loop;
      for j in pipe_stages-1 downto 0 loop
        bx_pipe(j+1) <= bx_pipe(j);
      end loop;
    end if;
  end process;

  -- pack the per-input address registers into the flat addr_arr port
  GEN_ADDR: for i in 0 to NUM_INPUTS-1 generate
  begin
    addr_arr(LOG2_RAM_DEPTH*(i+1)-1 downto LOG2_RAM_DEPTH*i) <= addr_arr_int(i);
  end generate;

end RTL;
2 changes: 1 addition & 1 deletion emData/project_generation_scripts

0 comments on commit 7f01c3c

Please sign in to comment.