antonblanchard · mikey · Apr 9, 2024 · Mar 11, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/countbits.vhdl b/countbits.vhdl
@@ -50,9 +50,11 @@ architecture behaviour of bit_counter is
 begin
     countzero_r: process(clk)
     begin
-        if rising_edge(clk) and stall = '0' then
-            inp_r <= inp;
-            sum_r <= sum;
+        if rising_edge(clk) then
+            if stall = '0' then  -- inp_r/sum_r are only updated on clock edges where stall is '0'
+                inp_r <= inp;
+                sum_r <= sum;
+            end if;
         end if;
     end process;
 

diff --git a/fetch1.vhdl b/fetch1.vhdl
@@ -102,9 +102,6 @@ architecture behaviour of fetch1 is
     signal itlb_pte : tlb_pte_t;
     signal itlb_hit : std_ulogic;
 
-    -- Privilege bit from PTE EAA field
-    signal eaa_priv  : std_ulogic;
-
     -- Simple hash for direct-mapped TLB index
     function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
         variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
@@ -155,7 +152,7 @@ begin
         attribute ram_style of btc_memory : signal is "block";
 
         signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
-        attribute ram_style of btc_valids : signal is "distributed";
+        -- attribute ram_style of btc_valids : signal is "distributed";
 
         signal btc_wr : std_ulogic;
         signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);

diff --git a/fpga/arty_a7.xdc b/fpga/arty_a7.xdc
@@ -171,15 +171,15 @@ set_property -dict { PACKAGE_PIN R15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_po
 set_property -dict { PACKAGE_PIN P15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io33 }];
 set_property -dict { PACKAGE_PIN R16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io34 }];
 set_property -dict { PACKAGE_PIN N16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io35 }];
-set_property -dict { PACKAGE_PIN N14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io36 }];
-set_property -dict { PACKAGE_PIN U17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io37 }];
-set_property -dict { PACKAGE_PIN T18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io38 }];
-set_property -dict { PACKAGE_PIN R18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io39 }];
-set_property -dict { PACKAGE_PIN P18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io40 }];
-set_property -dict { PACKAGE_PIN N17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io41 }];
-set_property -dict { PACKAGE_PIN M17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io42 }]; # A
-set_property -dict { PACKAGE_PIN L18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io43 }]; # SCL
-set_property -dict { PACKAGE_PIN M18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io44 }]; # SDA
+#set_property -dict { PACKAGE_PIN N14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io36 }];
+#set_property -dict { PACKAGE_PIN U17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io37 }];
+#set_property -dict { PACKAGE_PIN T18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io38 }];
+#set_property -dict { PACKAGE_PIN R18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io39 }];
+#set_property -dict { PACKAGE_PIN P18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io40 }];
+#set_property -dict { PACKAGE_PIN N17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io41 }];
+#set_property -dict { PACKAGE_PIN M17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io42 }]; # A
+#set_property -dict { PACKAGE_PIN L18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io43 }]; # SCL
+#set_property -dict { PACKAGE_PIN M18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io44 }]; # SDA
 #set_property -dict { PACKAGE_PIN C2  IOSTANDARD LVCMOS33 } [get_ports { shield_rst }];
 
 #set_property -dict { PACKAGE_PIN C1  IOSTANDARD LVCMOS33 } [get_ports { spi_hdr_ss }];

diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl
@@ -206,6 +206,9 @@ architecture behaviour of toplevel is
     signal ddram_clk_p_vec : std_logic_vector(0 downto 0);
     signal ddram_clk_n_vec : std_logic_vector(0 downto 0);
 
+    signal uart1_rxd : std_ulogic;
+    signal uart1_txd : std_ulogic;
+
     -- Fixup various memory sizes based on generics
     function get_bram_size return natural is
     begin
@@ -266,8 +269,8 @@ begin
             uart0_rxd         => uart_main_rx,
 
 	    -- UART1 signals
-	    --uart1_txd         => uart_pmod_tx,
-	    --uart1_rxd         => uart_pmod_rx,
+            uart1_txd         => uart1_txd,
+            uart1_rxd         => uart1_rxd,
 
             -- SPI signals
             spi_flash_sck     => spi_sck,
@@ -302,7 +305,7 @@ begin
             wishbone_dma_out     => wb_sddma_out
             );
 
-    --uart_pmod_rts_n <= '0';
+    uart1_rxd <= '1';  -- UART1 is not pinned out: hold its receive input at the idle (high) level; uart1_txd is left for the SoC port to drive
 
     -- SPI Flash
     --
@@ -415,8 +418,9 @@ begin
                 );
 
         -- Generate SoC reset
-        soc_rst_gen: process(system_clk)
+        soc_rst_gen: process(system_clk, ext_rst_n)
         begin
+            -- XXX why does this need to be an asynchronous reset?
             if ext_rst_n = '0' then
                 soc_rst <= '1';
             elsif rising_edge(system_clk) then

diff --git a/fpu.vhdl b/fpu.vhdl
@@ -953,7 +953,6 @@ begin
             v.denorm := '0';
             v.is_subtract := '0';
             v.add_bsmall := '0';
-            v.doing_ftdiv := "00";
             v.int_ovf := '0';
             v.div_close := '0';
 
@@ -1007,7 +1006,7 @@ begin
         elsif new_exp < min_exp then
             exp_tiny := '1';
         end if;
-	if is_X(new_exp) or is_X(min_exp) then
+	if is_X(new_exp) or is_X(max_exp) then
 	    exp_huge := 'X';
 	elsif new_exp > max_exp then
             exp_huge := '1';
@@ -1038,6 +1037,7 @@ begin
 
         v.update_fprf := '0';
         v.first := '0';
+        v.doing_ftdiv := "00";
         v.opsel_a := AIN_R;
         opsel_ainv <= '0';
         opsel_mask <= '0';
@@ -1147,8 +1147,10 @@ begin
                 v.instr_done := '1';
 
             when DO_FTDIV =>
-                v.instr_done := '1';
                 v.cr_result := "0000";
+                -- set result_exp to the exponent of B
+                re_sel2 <= REXP2_B;
+                re_set_result <= '1';
                 if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
                     (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then
                     v.cr_result(2) := '1';
@@ -1157,6 +1159,7 @@ begin
                     r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or
                     (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then
                     v.cr_result(1) := '1';
+                    v.instr_done := '1';
                 else
                     v.doing_ftdiv := "11";
                     v.first := '1';
@@ -1173,7 +1176,7 @@ begin
                 end if;
                 if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
                     or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then
-                    v.cr_result(1) := '0';
+                    v.cr_result(1) := '1';
                 end if;
 
             when DO_FCMP =>
@@ -2148,6 +2151,9 @@ begin
                 v.state := NORMALIZE;
 
             when FTDIV_1 =>
+                -- We go through this state up to two times; the first sees if
+                -- B.exponent is in the range [-1021,1020], and the second tests
+                -- whether B.exp - A.exp is in the range [-1022,1020].
                 v.cr_result(1) := exp_tiny or exp_huge;
                 -- set shift to a.exp
                 rs_sel2 <= RSH2_A;

diff --git a/icache.vhdl b/icache.vhdl
@@ -403,12 +403,12 @@ begin
             variable snoop_addr : real_addr_t;
             variable next_raddr : real_addr_t;
         begin
-            replace_way := to_unsigned(0, WAY_BITS);
-            if NUM_WAYS > 1 then
-                -- Get victim way from plru
-                replace_way := plru_victim;
-            end if;
             if rising_edge(clk) then
+                replace_way := to_unsigned(0, WAY_BITS);
+                if NUM_WAYS > 1 then
+                    -- Get victim way from plru
+                    replace_way := plru_victim;
+                end if;
                 -- Read tags using NIA for next cycle
                 if flush_in = '1' or i_in.req = '0' or (stall_in = '0' and stall_out = '0') then
                     next_raddr := i_in.next_rpn & i_in.next_nia(MIN_LG_PGSZ - 1 downto 0);
@@ -649,6 +649,7 @@ begin
     begin
         if rising_edge(clk) then
             ev.icache_miss <= '0';
+            ev.itlb_miss_resolved <= '0';
             r.recv_valid <= '0';
 	    -- On reset, clear all valid bits to force misses
             if rst = '1' then

diff --git a/microwatt.core b/microwatt.core
@@ -62,14 +62,13 @@ filesets:
       - fpga/pp_soc_uart.vhd
       - fpga/pp_utilities.vhd
       - fpga/firmware.hex : {copyto : firmware.hex, file_type : user}
+      - nonrandom.vhdl
     file_type : vhdlSource-2008
 
   xilinx_specific:
     files:
       - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
       - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
-      - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
-      - fpga/fpga-random.xdc : {file_type : xdc}
 
   debug_xilinx:
     files:

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
@@ -1665,6 +1665,65 @@ int fpu_test_25(void)
 	return 0;
 }
 
+struct ftvals {
+	unsigned long val_a;	/* bit pattern loaded into FPR 5 by test26 (operand A) */
+	unsigned long val_b;	/* bit pattern loaded into FPR 6 by test26 (operand B) */
+	int cr_ftdiv;		/* CR field value test26 expects from ftdiv on A, B */
+	int cr_ftsqrt;		/* CR field value test26 expects from ftsqrt on A */
+} ftvals[] = {
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0, 0 },
+	{ 0x0000000000000000, 0x3ff0000000000000, 0, 6 },
+	{ 0xfff0000000000000, 0x3ff0000000000000, 6, 6 },
+	{ 0x7ff1234560000000, 0x3ff0000000000000, 2, 2 },
+	{ 0x3ff0000000000000, 0xfff0000000000000, 6, 0 },
+	{ 0x3ff0000000000000, 0x8000000000000000, 6, 0 },
+	{ 0x3ff0000000000000, 0x7ff9234560000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x0020000000000000, 0, 0 },
+	{ 0x3ff0000000000000, 0x0010000000000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x0001000000000000, 6, 0 },
+	{ 0x3ff0000000000000, 0x7fb1234500000000, 0, 0 },
+	{ 0x3ff0000000000000, 0x7fc1234500000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x7fd1234500000000, 2, 0 },
+	{ 0x3ff0000000000000, 0x7fe1234500000000, 2, 0 },
+	{ 0x6000000000000000, 0x2000000000000000, 2, 0 },
+	{ 0x5ff0000000000000, 0x2000000000000000, 2, 0 },
+	{ 0x5fe0000000000000, 0x2000000000000000, 0, 0 },
+	{ 0x2000000000000000, 0x5fc0000000000000, 0, 0 },
+	{ 0x2000000000000000, 0x5fd0000000000000, 2, 0 },
+	{ 0x0360000000000000, 0x4320000000000000, 0, 0 },
+	{ 0x0350000000000000, 0x4310000000000000, 2, 2 },
+	{ 0x0010000000000000, 0x3fd0000000000000, 2, 2 },
+	{ 0x0001000000000000, 0x3fd0000000000000, 2, 6 },
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0, 2 },
+	{ 0x3fd0000000000000, 0x0001000000000000, 6, 0 },
+};
+
+int test26(long arg)	/* check ftdiv/ftsqrt CR results against every ftvals[] entry */
+{			/* returns 0 if all match, else 1 + index of the first mismatch */
+	long i;
+	int cr;
+	struct ftvals *vp = ftvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(ftvals) / sizeof(ftvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%1); lfd 6,8(%1); ftdiv 5,5,6; ftsqrt 4,5; mfcr %0" :
+		    "=r" (cr) : "b" (&vp->val_a) : "cr4", "cr5");
+		if (((cr >> 8) & 0xf) != vp->cr_ftdiv ||	/* CR field 5, written by ftdiv */
+		    ((cr >> 12) & 0xf) != vp->cr_ftsqrt) {	/* CR field 4, written by ftsqrt; 4-bit mask keeps CR field 3 out */
+			print_hex(i, 2, " ");
+			print_hex(cr, 8, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_26(void)
+{
+	enable_fp();
+	return trapit(0, test26);	/* hands test26 and argument 0 to trapit; trapit's return value is this test's result */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1715,6 +1774,7 @@ int main(void)
 	do_test(23, fpu_test_23);
 	do_test(24, fpu_test_24);
 	do_test(25, fpu_test_25);
+	do_test(26, fpu_test_26);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
@@ -23,3 +23,4 @@ test 22:PASS
 test 23:PASS
 test 24:PASS
 test 25:PASS
+test 26:PASS
diff --git a/xics.vhdl b/xics.vhdl
@@ -386,15 +386,14 @@ begin
     reg_write: process(clk)
         variable be_in  : std_ulogic_vector(31 downto 0);
     begin
-        -- Byteswapped input
-        be_in := bswap(wb_in.dat);
-
         if rising_edge(clk) then
             if rst = '1' then
                 for i in 0 to SRC_NUM - 1 loop
                     xives(i) <= (pri => pri_masked);
                 end loop;
             elsif wb_valid = '1' and wb_in.we = '1' then
+                -- Byteswapped input
+                be_in := bswap(wb_in.dat);
                 if reg_is_xive then
                     -- TODO: When adding support for other bits, make sure to
                     -- properly implement wb_in.sel to allow partial writes.

diff --git a/xilinx-mult-32s.vhdl b/xilinx-mult-32s.vhdl
@@ -286,9 +286,11 @@ begin
 
     process(clk)
     begin
-        if rising_edge(clk) and stall = '0' then
-            m_out.valid <= m_in.valid;
-            product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);
+        if rising_edge(clk) then
+            if stall = '0' then  -- m_out.valid/product_lo are only updated on clock edges where stall is '0'
+                m_out.valid <= m_in.valid;
+                product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);
+            end if;
         end if;
     end process;