From f839818c60159a901f1fafbc4fbddab24feef20b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20J=C3=B8rgensen?= Date: Thu, 11 Jul 2024 11:32:51 +0200 Subject: [PATCH] Copy files from C64 repo This fixes a lockup seen on the MEGA65, and could be related to issue #8 --- src/Example_Design/top-R4.xdc | 4 + src/Example_Design/top.vhd | 2 +- src/Example_Design/top.xdc | 4 + src/Example_Design/top.xpr | 18 ++- src/hyperram/hyperram.vhd | 21 ---- src/hyperram/hyperram_ctrl.vhd | 19 --- src/hyperram/hyperram_fifo.vhd | 213 +++++++++++++++------------------ src/hyperram/hyperram_rx.vhd | 21 ++-- 8 files changed, 130 insertions(+), 172 deletions(-) diff --git a/src/Example_Design/top-R4.xdc b/src/Example_Design/top-R4.xdc index 8af9a4b..68f3722 100644 --- a/src/Example_Design/top-R4.xdc +++ b/src/Example_Design/top-R4.xdc @@ -80,6 +80,10 @@ set_max_delay 2 -datapath_only -from [get_cells i_core/i_hyperram/hyperram_rx_in # Prevent insertion of extra BUFG set_property CLOCK_BUFFER_TYPE NONE [get_nets -of [get_pins i_core/i_hyperram/hyperram_rx_inst/delay_rwds_inst/DATAOUT]] +# Receive FIFO: There is a CDC in the LUTRAM. +# There is approx 1.1 ns Clock->Data delay for the LUTRAM itself, plus 0.5 ns routing delay to the capture flip-flop. +set_max_delay 2 -datapath_only -from [get_clocks hr_rwds] -to [get_clocks hr_clk] + ################################################################################ # HyperRAM timing (correct for IS66WVH8M8DBLL-100B1LI) diff --git a/src/Example_Design/top.vhd b/src/Example_Design/top.vhd index ac1b5de..197fd85 100755 --- a/src/Example_Design/top.vhd +++ b/src/Example_Design/top.vhd @@ -61,7 +61,7 @@ architecture synthesis of top is signal sys_data_read : std_logic_vector(31 downto 0); signal sys_count_long : unsigned(31 downto 0); signal sys_count_short : unsigned(31 downto 0); - signal sys_count_error : unsigned(31 downto 0); + signal sys_count_error : std_logic_vector(31 downto 0); -- Interface to MEGA65 video signal sys_digits : std_logic_vector(191 downto 0); diff --git a/src/Example_Design/top.xdc b/src/Example_Design/top.xdc index cb29b03..8a61dd4 100644 --- a/src/Example_Design/top.xdc +++ b/src/Example_Design/top.xdc @@ -80,6 +80,10 @@ set_max_delay 2 -datapath_only -from [get_cells i_core/i_hyperram/hyperram_rx_in # Prevent insertion of extra BUFG set_property CLOCK_BUFFER_TYPE NONE [get_nets -of [get_pins i_core/i_hyperram/hyperram_rx_inst/delay_rwds_inst/DATAOUT]] +# Receive FIFO: There is a CDC in the LUTRAM. +# There is approx 1.1 ns Clock->Data delay for the LUTRAM itself, plus 0.5 ns routing delay to the capture flip-flop. +set_max_delay 2 -datapath_only -from [get_clocks hr_rwds] -to [get_clocks hr_clk] + ################################################################################ # HyperRAM timing (correct for IS66WVH8M8DBLL-100B1LI) diff --git a/src/Example_Design/top.xpr b/src/Example_Design/top.xpr index cbb0fd4..d9b370b 100644 --- a/src/Example_Design/top.xpr +++ b/src/Example_Design/top.xpr @@ -88,12 +88,6 @@ - - - - - - @@ -146,6 +140,18 @@ + + + + + + + + + + + + diff --git a/src/hyperram/hyperram.vhd b/src/hyperram/hyperram.vhd index 1518e77..0f81560 100644 --- a/src/hyperram/hyperram.vhd +++ b/src/hyperram/hyperram.vhd @@ -92,24 +92,6 @@ architecture synthesis of hyperram is signal ctrl_rwds_oe : std_logic; signal ctrl_rwds_in : std_logic; signal ctrl_read : std_logic; - signal ctrl_dq_error : std_logic; - - signal stat_underrun : std_logic; - signal stat_timeout : std_logic; - --- attribute mark_debug : string; --- attribute mark_debug of ctrl_ck_ddr : signal is "true"; --- attribute mark_debug of ctrl_dq_ddr_in : signal is "true"; --- attribute mark_debug of ctrl_dq_ddr_out : signal is "true"; --- attribute mark_debug of ctrl_dq_oe : signal is "true"; --- attribute mark_debug of ctrl_dq_ie : signal is "true"; --- attribute mark_debug of ctrl_rwds_ddr_out : signal is "true"; --- attribute mark_debug of ctrl_rwds_oe : signal is "true"; --- attribute mark_debug of ctrl_rwds_in : signal is "true"; --- attribute mark_debug of ctrl_read : signal is "true"; --- attribute mark_debug of ctrl_dq_error : signal is "true"; --- attribute mark_debug of stat_underrun : signal is "true"; --- attribute mark_debug of stat_timeout : signal is "true"; begin @@ -216,8 +198,6 @@ begin avm_waitrequest_o => cfg_waitrequest, count_long_o => count_long_o, count_short_o => count_short_o, - underrun_o => stat_underrun, - timeout_o => stat_timeout, hb_rstn_o => ctrl_rstn, hb_csn_o => ctrl_csn, hb_ck_ddr_o => ctrl_ck_ddr, @@ -248,7 +228,6 @@ begin ctrl_dq_ie_o => ctrl_dq_ie, ctrl_rwds_in_o => ctrl_rwds_in, ctrl_read_i => ctrl_read, - ctrl_dq_error_o => ctrl_dq_error, hr_rwds_in_i => hr_rwds_in_i, hr_dq_in_i => hr_dq_in_i ); -- hyperram_rx_inst diff --git a/src/hyperram/hyperram_ctrl.vhd b/src/hyperram/hyperram_ctrl.vhd index 84e0ba2..c5e3d92 100644 --- a/src/hyperram/hyperram_ctrl.vhd +++ b/src/hyperram/hyperram_ctrl.vhd @@ -33,8 +33,6 @@ entity hyperram_ctrl is -- Statistics count_long_o : out unsigned(31 downto 0); count_short_o : out unsigned(31 downto 0); - underrun_o : out std_logic; - timeout_o : out std_logic; -- HyperBus control signals hb_rstn_o : out std_logic; @@ -91,8 +89,6 @@ architecture synthesis of hyperram_ctrl is -- Statistics signal count_long : unsigned(31 downto 0); signal count_short : unsigned(31 downto 0); - signal hb_dq_ie_d : std_logic; - signal timeout_count : natural range 0 to 15; begin @@ -103,9 +99,6 @@ begin hb_dq_oe_o <= '0'; hb_rwds_oe_o <= '0'; hb_read_o <= '0'; - underrun_o <= '0'; - timeout_o <= '0'; - hb_dq_ie_d <= hb_dq_ie_i; case state is when INIT_ST => @@ -171,7 +164,6 @@ begin read_clk_count <= burst_count+1; read_return_count <= burst_count; hb_read_o <= '1'; - timeout_count <= 15; state <= READ_ST; else write_clk_count <= burst_count; @@ -189,18 +181,7 @@ begin hb_ck_ddr_o <= "00"; end if; - if hb_dq_ie_d = '1' and hb_dq_ie_i = '0' then - underrun_o <= '1'; - end if; - - if timeout_count > 0 then - timeout_count <= timeout_count - 1; - else - timeout_o <= '1'; - end if; - if hb_dq_ie_i = '1' then - timeout_count <= 1; read_return_count <= read_return_count - 1; if read_return_count = 1 then hb_csn_o <= '1'; diff --git a/src/hyperram/hyperram_fifo.vhd b/src/hyperram/hyperram_fifo.vhd index c9ae525..22786f9 100644 --- a/src/hyperram/hyperram_fifo.vhd +++ b/src/hyperram/hyperram_fifo.vhd @@ -1,14 +1,11 @@ -- This is part of the HyperRAM Rx connections. --- It is a general-purpose shallow (two-element) asynchronuous FIFO. +-- It is a general-purpose shallow asynchronuous FIFO. -- -- Created by Michael Jørgensen in 2023 (mjoergen.github.io/HyperRAM). library ieee; use ieee.std_logic_1164.all; - use ieee.numeric_std.all; - -library xpm; - use xpm.vcomponents.all; + use ieee.numeric_std_unsigned.all; entity hyperram_fifo is generic ( @@ -20,156 +17,140 @@ entity hyperram_fifo is src_data_i : in std_logic_vector(G_DATA_SIZE - 1 downto 0); dst_clk_i : in std_logic; dst_valid_o : out std_logic; - dst_data_o : out std_logic_vector(G_DATA_SIZE - 1 downto 0); - dst_error_o : out std_logic + dst_data_o : out std_logic_vector(G_DATA_SIZE - 1 downto 0) ); end entity hyperram_fifo; architecture synthesis of hyperram_fifo is - -- Input registers in source clock domain - signal src_registers : std_logic_vector(2 * G_DATA_SIZE - 1 downto 0); + -- Number of bits in gray-code counters + constant C_GRAY_SIZE : natural := 3; - -- Write pointer in source clock domain - signal src_gray_wr : std_logic_vector(1 downto 0) := "00"; + -- Number of words in FIFO + constant C_FIFO_SIZE : natural := 2 ** (C_GRAY_SIZE - 1); - -- This instructs Vivado to leave the write pointer literally "as is", instead of - -- potentially making some FSM optimization. The reason is that we want the input of - -- the CDC to be directly from registers, rather than there being any extra - -- combinational logic. - attribute fsm_encoding : string; - attribute fsm_encoding of src_gray_wr : signal is "none"; + -- Dual-port LUTRAM memory to contain the FIFO data + -- We use LUTRAM instead of registers to save space in the FPGA. + -- We could use BRAM, but there is a higher delay writing to BRAM than to LUTRAM. + type ram_type is array (natural range <>) of std_logic_vector(G_DATA_SIZE - 1 downto 0); + signal dpram : ram_type(0 to C_FIFO_SIZE - 1); + attribute ram_style : string; + attribute ram_style of dpram : signal is "distributed"; - -- Input registers in destination clock domain - signal dst_registers : std_logic_vector(2 * G_DATA_SIZE - 1 downto 0); + -- We're using gray codes to avoid glitches when transferring between clock domains. - -- Write pointer in destination clock domain - signal dst_gray_wr : std_logic_vector(1 downto 0) := "00"; + -- Write pointer (gray code) in source clock domain + signal src_gray_wr : std_logic_vector(C_GRAY_SIZE - 1 downto 0) := (others => '0'); - -- Read pointer in destination clock domain - signal dst_gray_rd : std_logic_vector(1 downto 0) := "00"; + -- Write pointer (gray code) in destination clock domain + signal dst_gray_wr : std_logic_vector(C_GRAY_SIZE - 1 downto 0) := (others => '0'); - signal overflow : std_logic := '0'; + -- Read pointer (gray code) in destination clock domain + signal dst_gray_rd : std_logic_vector(C_GRAY_SIZE - 1 downto 0) := (others => '0'); -begin + -- Handle CDC + -- There must additionally be an explicit set_max_delay in the constraint file. + signal dst_gray_wr_meta : std_logic_vector(C_GRAY_SIZE - 1 downto 0) := (others => '0'); + attribute async_reg : string; + attribute async_reg of dst_gray_wr_meta : signal is "true"; + attribute async_reg of dst_gray_wr : signal is "true"; - src_proc : process (src_clk_i) + -- Convert binary to gray code + + pure function bin2gray ( + b : std_logic_vector + ) return std_logic_vector is + variable g_v : std_logic_vector(b'range); begin - if rising_edge(src_clk_i) then - if src_valid_i = '1' then + g_v(b'left) := b(b'left); - case src_gray_wr is + for i in b'left-1 downto b'right loop + g_v(i) := b(i + 1) xor b(i); + end loop; - when "00" => - src_gray_wr <= "01"; - src_registers(G_DATA_SIZE - 1 downto 0) <= src_data_i; + return g_v; + end function bin2gray; - when "01" => - src_gray_wr <= "11"; - src_registers(2 * G_DATA_SIZE - 1 downto G_DATA_SIZE) <= src_data_i; + -- Convert gray code to binary - when "10" => - src_gray_wr <= "00"; - src_registers(2 * G_DATA_SIZE - 1 downto G_DATA_SIZE) <= src_data_i; + pure function gray2bin ( + g : std_logic_vector + ) return std_logic_vector is + variable b_v : std_logic_vector(g'range); + begin + b_v(g'left) := g(g'left); - when "11" => - src_gray_wr <= "10"; - src_registers(G_DATA_SIZE - 1 downto 0) <= src_data_i; + for i in g'left-1 downto g'right loop + b_v(i) := b_v(i + 1) xor g(i); + end loop; - when others => - null; + return b_v; + end function gray2bin; - end case; +begin + -- Dual port memory: One write port, and one read port. + -- The memory is implemented with LUTRAM. There is no + -- need for a complete CDC circuit on the output of the LUTRAM, a simple + -- flip-flop is sufficient. This is because the contents being read from the LUTRAM is + -- not changing at the time it is sampled. This is due to the CDC causing a (usually) two-cycle + -- delay between writing to and reading from a given memory location. + dpram_proc : process (src_clk_i, dst_clk_i) + variable index_v : natural range 0 to C_FIFO_SIZE - 1; + begin + -- Write to memory + if rising_edge(src_clk_i) then + if src_valid_i = '1' then + index_v := to_integer(gray2bin(src_gray_wr)) mod C_FIFO_SIZE; + dpram(index_v) <= src_data_i; + end if; + end if; + + -- Read from memory + if rising_edge(dst_clk_i) then + if dst_gray_wr /= dst_gray_rd then + index_v := to_integer(gray2bin(dst_gray_rd)) mod C_FIFO_SIZE; + dst_data_o <= dpram(index_v); + end if; + end if; + end process dpram_proc; + + -- Update write pointer + src_proc : process (src_clk_i) + variable index_v : natural range 0 to C_FIFO_SIZE - 1; + begin + if rising_edge(src_clk_i) then + if src_valid_i = '1' then + src_gray_wr <= bin2gray(gray2bin(src_gray_wr) + 1); end if; end if; end process src_proc; - xpm_cdc_array_single_ram_inst : component xpm_cdc_array_single - generic map ( - DEST_SYNC_FF => 2, - INIT_SYNC_FF => 0, - SIM_ASSERT_CHK => 0, - SRC_INPUT_REG => 0, - WIDTH => 2 * G_DATA_SIZE - ) - port map ( - src_clk => '0', - src_in => src_registers, - dest_clk => dst_clk_i, - dest_out => dst_registers - ); -- xpm_cdc_array_single_ram_inst - - -- Note that the write pointer is delayed one additional clock cycle (3) compared to - -- that of the input registers (2). This is to account for any skew there may be in the - -- Clock Domain Crossing. In short, we want to make sure that the input registers - -- (src_registers) are updated no later than the write pointer (src_gray_wr). - xpm_cdc_array_single_gray_wr_inst : component xpm_cdc_array_single - generic map ( - DEST_SYNC_FF => 3, - INIT_SYNC_FF => 0, - SIM_ASSERT_CHK => 0, - SRC_INPUT_REG => 0, - WIDTH => 2 - ) - port map ( - src_clk => '0', - src_in => src_gray_wr, - dest_clk => dst_clk_i, - dest_out => dst_gray_wr - ); -- xpm_cdc_array_single_gray_wr_inst + -- Handle CDC explicitly. + -- We won't use the Xilinx XPM primitive, because that includes a set_false_path. + -- Instead, we use a set_max_delay in the constraints. + async_proc : process (dst_clk_i) + begin + if rising_edge(dst_clk_i) then + dst_gray_wr_meta <= src_gray_wr; + dst_gray_wr <= dst_gray_wr_meta; + end if; + end process async_proc; -- Forward data, one word at a time, as soon as the write pointer is different from -- the read pointer. dst_proc : process (dst_clk_i) + variable index_v : natural range 0 to C_FIFO_SIZE - 1; begin if rising_edge(dst_clk_i) then dst_valid_o <= '0'; - overflow <= '0'; if dst_gray_wr /= dst_gray_rd then - - case dst_gray_rd is - - when "00" => - if dst_gray_wr /= "01" then - overflow <= '1'; - end if; - dst_data_o <= dst_registers(G_DATA_SIZE - 1 downto 0); - dst_gray_rd <= "01"; - - when "01" => - if dst_gray_wr /= "11" then - overflow <= '1'; - end if; - dst_data_o <= dst_registers(2 * G_DATA_SIZE - 1 downto G_DATA_SIZE); - dst_gray_rd <= "11"; - - when "10" => - if dst_gray_wr /= "00" then - overflow <= '1'; - end if; - dst_data_o <= dst_registers(2 * G_DATA_SIZE - 1 downto G_DATA_SIZE); - dst_gray_rd <= "00"; - - when "11" => - if dst_gray_wr /= "10" then - overflow <= '1'; - end if; - dst_data_o <= dst_registers(G_DATA_SIZE - 1 downto 0); - dst_gray_rd <= "10"; - - when others => - null; - - end case; - + dst_gray_rd <= bin2gray(gray2bin(dst_gray_rd) + 1); dst_valid_o <= '1'; end if; end if; end process dst_proc; - dst_error_o <= overflow; - end architecture synthesis; diff --git a/src/hyperram/hyperram_rx.vhd b/src/hyperram/hyperram_rx.vhd index 682c7e8..4295099 100644 --- a/src/hyperram/hyperram_rx.vhd +++ b/src/hyperram/hyperram_rx.vhd @@ -25,7 +25,6 @@ entity hyperram_rx is ctrl_dq_ie_o : out std_logic; ctrl_rwds_in_o : out std_logic; ctrl_read_i : in std_logic; - ctrl_dq_error_o : out std_logic; -- Connect to HyperRAM device hr_rwds_in_i : in std_logic; @@ -38,8 +37,9 @@ architecture synthesis of hyperram_rx is signal rwds_dq_in : std_logic_vector(15 downto 0); signal rwds_in_delay : std_logic; - signal ctrl_dq_ie : std_logic; - signal ctrl_dq_ie_d : std_logic; + signal ctrl_dq_ie : std_logic; + signal ctrl_dq_ie_d : std_logic; + signal ctrl_dq_ie_d2 : std_logic; begin @@ -53,13 +53,13 @@ begin -- Delay the input RWDS signal by approx 2.5 ns (90 degrees). -- Each tap is on average 1/32 of the period of delay_refclk_i (here 5 ns), - -- but the taps are not evenly spaced. Therefore a value of 21 (rather than 16) + -- but the taps are not evenly spaced. Therefore a value of 20 (rather than 16) -- is used. The actual amount of delay can be read from the timing report. delay_rwds_inst : component idelaye2 generic map ( IDELAY_TYPE => "FIXED", DELAY_SRC => "IDATAIN", - IDELAY_VALUE => 21, -- Number of taps. + IDELAY_VALUE => 20, HIGH_PERFORMANCE_MODE => "TRUE", SIGNAL_PATTERN => "CLOCK", REFCLK_FREQUENCY => 200.0, -- Each tap on average 5/32 ns. @@ -128,19 +128,22 @@ begin src_data_i => rwds_dq_in, dst_clk_i => clk_i, dst_data_o => ctrl_dq_ddr_in_o, - dst_valid_o => ctrl_dq_ie, - dst_error_o => ctrl_dq_error_o + dst_valid_o => ctrl_dq_ie ); -- hyperram_fifo_inst -- This skips the first clock cycle of data from the FIFO. ctrl_dq_ie_d_proc : process (clk_i) begin if rising_edge(clk_i) then - ctrl_dq_ie_d <= ctrl_dq_ie; + ctrl_dq_ie_d <= ctrl_dq_ie; -- delayed version of data valid + ctrl_dq_ie_d2 <= ctrl_dq_ie_d; -- we need past-2 time also end if; end process ctrl_dq_ie_d_proc; - ctrl_dq_ie_o <= ctrl_dq_ie_d and ctrl_dq_ie; + -- if it was low for 2 clock cycles then we cut out the valid signal + -- it works as long as there is never 2 clocks without fifo valid + -- but this is so by design, there is max 1 clock cycle where fifo data is not available + ctrl_dq_ie_o <= (ctrl_dq_ie_d or ctrl_dq_ie_d2) and ctrl_dq_ie; end architecture synthesis;