-----------------------------------------------------------------
-- uCore 1.90 - uCntrl.vhd                                      --
-----------------------------------------------------------------
--
-- Author: KLAUS SCHLEISIEK
-- Last change: KS 09.12.2015 18:02:12
--
-- Do not use this file except in compliance with the License. You may
-- obtain a copy of the License at http://www.microcore.org/License/
-- Software distributed under the License is distributed on an "AS IS" basis,
-- WITHOUT WARRANTY OF ANY KIND, either express or implied.
-- See the License for the specific language governing rights and limitations
-- under the License.
--
-- The Initial Developer of the Original Code is Klaus.Schleisiek AT microcore.org.
--
-- The uCore engine

LIBRARY IEEE;     
USE IEEE.STD_LOGIC_1164.ALL;
USE IEEE.STD_LOGIC_signed.ALL;
USE work.functions.ALL;
USE work.constants.ALL;

-- microcontrol: the uCore processor engine (registers, stacks, status,
-- interrupt/exception handling, timer and instruction decoder).
-- All port types are declared outside this file (packages work.functions /
-- work.constants are the only project packages imported here).
ENTITY microcontrol IS
PORT (uBus        : IN  uBus_port;     -- supplies clk, clk_en, read_en and the register sources
      reset       : IN  STD_LOGIC;     -- active high; asynchronous or synchronous depending on constant async_reset
      except      : IN  STD_LOGIC;     -- external exception request, forwarded to the internal exception signal
      disable     : IN  STD_LOGIC;     -- when '1' the core registers are not updated
      uReg        : OUT core_signals;  -- int, task, time and tick outputs of the core
      progmem     : OUT progmem_port;  -- program memory enable, write, addr and dout
      prog_din    : IN  inst_bus;      -- instruction/data read from program memory
      datamem		: OUT datamem_port;  -- data memory enable, write, addr, reg_addr and dout
      data_din    : IN  data_bus       -- data read from data memory
     );
END microcontrol;

ARCHITECTURE rtl OF microcontrol IS

-- Short names for the uBus record fields used throughout this architecture.
ALIAS  clk          : STD_LOGIC IS uBus.clk;
ALIAS  clk_en       : STD_LOGIC IS uBus.clk_en;
ALIAS  read_en      : STD_LOGIC IS uBus.read_en;
ALIAS  sources      : data_sources IS uBus.sources;

-- Register update enable: clk_en gated off by exception and by disable.
SIGNAL core_en      : STD_LOGIC;

CONSTANT dw         : NATURAL := data_width; -- short hand

-- uCore registers
-- r holds the registered core state; r_in is the next-state value that the
-- uRegs process loads into r on an enabled rising clock edge.
TYPE  uCore_registers  IS RECORD
   tos              : data_bus;      -- top of data stack (push_stack copies it into nos)
   nos              : data_bus;      -- next of data stack (push_stack spills it into the stack RAM)
   tor              : data_bus;      -- top of return stack (push_rstack writes it to data memory)
   dsp              : ds_addr;       -- data stack pointer, addresses the stack RAM
   rsp              : rs_addr;       -- return stack pointer, reset to all ones
   pc               : program_addr;  -- program counter, default program memory address
   inst             : inst_bus;      -- instruction examined by the instruction decoder
END RECORD;

SIGNAL r            : uCore_registers;
SIGNAL r_in         : uCore_registers;
SIGNAL status       : status_bus;    -- status word assembled from s, s_n and s_z
SIGNAL task         : data_addr;     -- task register; loaded from r.nos on a TASK_REG write, constant zero without tasks

-- async signals
SIGNAL flags        : flag_bus;  -- low flag_width bits of sources(FLAG_REG)

-- status register
-- s is the registered status, s_in its next-state value (loaded when core_en = '1').
SIGNAL s            : status_register;
SIGNAL s_in         : status_register;
SIGNAL zero         : STD_LOGIC; -- '1' when r.tos = 0
SIGNAL s_z          : STD_LOGIC; -- zero, or the stored s.z while s.lit is set
SIGNAL sign         : STD_LOGIC; -- most significant bit of r.tos
SIGNAL s_n          : STD_LOGIC; -- sign, or the stored s.n while s.lit is set

-- data memory bus signals
SIGNAL mem_en       : STD_LOGIC; -- output to memory
SIGNAL mem_wr       : STD_LOGIC; -- output to memory
SIGNAL mem_addr     : data_bus;  -- output to memory
SIGNAL mem_dout     : data_bus;  -- output to memory
SIGNAL reg_addr     : reg_bus;   -- drives datamem.reg_addr

-- program memory
SIGNAL paddr        : program_addr; -- drives progmem.addr
SIGNAL pwrite       : STD_LOGIC;    -- drives progmem.write

-- data stack
-- Names are from the core's point of view: dst_din is the RAM's data output
-- (port "do"), dst_dout is the RAM's data input (port "di").
SIGNAL dst_din      : data_bus;
SIGNAL dst_dout     : data_bus;
SIGNAL dst_addr     : ds_addr;
SIGNAL dst_wr       : STD_LOGIC;
SIGNAL dst_en       : STD_LOGIC; -- only driven and used when syn_stackram is true

-- arithmetic
SIGNAL cin          : STD_LOGIC; -- carry input of the adder
SIGNAL ladd_x       : STD_LOGIC_VECTOR(data_width   DOWNTO 0);
SIGNAL ladd_y       : STD_LOGIC_VECTOR(data_width   DOWNTO 0);
SIGNAL ladd_out     : STD_LOGIC_VECTOR(data_width+1 DOWNTO 0); -- ladd_x + ladd_y + cin, one bit wider than the operands
SIGNAL add_x        : data_bus;
SIGNAL add_y        : data_bus;
SIGNAL sum          : data_bus;  -- low bits of ladd_out
SIGNAL multiplier   : STD_LOGIC_VECTOR(data_width DOWNTO 0) := (OTHERS => '0');
SIGNAL multiplicand : STD_LOGIC_VECTOR(data_width DOWNTO 0) := (OTHERS => '0');
SIGNAL product      : STD_LOGIC_VECTOR(data_width*2+1 DOWNTO 0); -- multiplicand * multiplier

-- interrupts & exceptions
SIGNAL ienable      : int_flags; -- interrupt enable mask, modified by writes to INT_REG
SIGNAL pending      : int_flags; -- flags AND ienable, sampled when read_en = '1'
SIGNAL interrupt    : STD_LOGIC;
SIGNAL exception    : STD_LOGIC;

-- timer
CONSTANT time_cnt   : NATURAL := (sys_frequency/(1000*ticks_per_ms))-1; -- reload value of time_ctr
SIGNAL time_ctr     : NATURAL RANGE 0 TO time_cnt; -- divides system clock
SIGNAL time         : data_bus;  -- incremented each time time_ctr reaches zero
SIGNAL tick         : STD_LOGIC; -- '1' for one clock cycle after each increment of time

BEGIN

-- program memory port: always enabled, write data is the low inst_width bits of NOS
progmem.addr      <= paddr;
progmem.dout      <= r.nos(inst_width-1 DOWNTO 0);
progmem.write     <= pwrite;
progmem.enable    <= '1';

-- data memory port
datamem.addr      <= mem_addr;
datamem.reg_addr  <= reg_addr;
datamem.dout      <= mem_dout;
datamem.write     <= mem_wr;
datamem.enable    <= mem_en;

-- core registers made visible to the outside, zero extended to data_width
uReg.time         <= time;
uReg.tick         <= tick;
uReg.task         <= slice('0', data_width-data_addr_width) & task;
uReg.int          <= slice('0', data_width-interrupts) & pending;

-- status word: n and z come from the multiplexed s_n / s_z signals,
-- all other bits directly from the status register s
status(n_bit)     <= s_n;
status(z_bit)     <= s_z;
status(lit_bit)   <= s.lit;
status(c_bit)     <= s.c;
status(ovfl_bit)  <= s.ovfl;
status(unfl_bit)  <= s.unfl;
status(ie_bit)    <= s.ie;
status(iis_bit)   <= s.iis;
status(div_bit)   <= s.div;
status(den_bit)   <= s.den;
status(word_bit)  <= s.word;
status(fwrd_bit)  <= s.fwrd;

-----------------------------------------------------------------------
-- internal registers and data stack
-----------------------------------------------------------------------

-- flags: the low flag_width bits of the FLAG_REG source
flags   <= sources(FLAG_REG)(flag_width-1 DOWNTO 0);
-- core_en: clock enable that is suppressed by an exception or by disable
core_en <= clk_en AND NOT (exception OR disable);

-- Core register bank. Reset is asynchronous or synchronous, selected by the
-- constant async_reset; both paths load the same values.
uRegs: PROCESS(clk, reset)

   -- reset values shared by the asynchronous and the synchronous reset path
   PROCEDURE reset_registers IS
   BEGIN
      r <= (OTHERS => (OTHERS => '0'));
      r.rsp <= (OTHERS => '1'); -- return stack grows towards lower addresses
      r.inst <= op_NOP;
   END reset_registers;

BEGIN
   IF  reset = '1' AND async_reset  THEN
      reset_registers;
   ELSIF  rising_edge(clk)  THEN
      IF  reset = '1' AND NOT async_reset  THEN
         reset_registers;
      ELSIF  clk_en = '1' AND disable = '0'  THEN
         IF  exception = '0'  THEN
            r <= r_in;               -- normal operation: load the complete next state
         ELSE
            r.pc <= r_in.pc;         -- on exception only pc and instruction are updated
            r.inst <= r_in.inst;
         END IF;
      END IF;
   END IF;
END PROCESS uRegs;

-- Data stack RAM built from the asynch_ram component (syn_stackram = false).
-- The core writes dst_dout into the RAM and reads dst_din back from it.
make_asyn_stack: IF  NOT syn_stackram  GENERATE

   data_stack: asynch_ram
   GENERIC MAP(data_width, r.dsp'high+1)
   PORT MAP(clk   => clk,
            addr  => dst_addr,
            we    => dst_wr,
            di    => dst_dout,
            do    => dst_din
           );

END GENERATE make_asyn_stack;

-- Data stack RAM built from the internal_ram component (syn_stackram = true);
-- its enable input is tied to '1'.
make_syn_stack: IF  syn_stackram  GENERATE

   data_stack: internal_ram
   GENERIC MAP(data_width, r.dsp'high+1)
   PORT MAP(clk   => clk,
            en    => dst_en,
            addr  => dst_addr,
            we    => dst_wr,
            di    => dst_dout,
            do    => dst_din
           );

   dst_en <= '1';

END GENERATE make_syn_stack;

------------------------------------------------------------------------------
-- task register
------------------------------------------------------------------------------

-- With tasks: task is a register that is cleared by reset and loaded from the
-- low bits of NOS whenever the core is enabled and TASK_REG is written.
if_make_task: IF  with_tasks  GENERATE

   task_register: PROCESS(clk, reset)
   BEGIN
      IF  reset = '1' AND async_reset  THEN
         task <= (OTHERS => '0');
      ELSIF  rising_edge(clk)  THEN
         IF  reset = '1' AND NOT async_reset  THEN
            task <= (OTHERS => '0');
         ELSIF  core_en = '1'  THEN
            IF  uREG_write(uBus, TASK_REG)  THEN
               task <= r.nos(task'high DOWNTO 0);
            END IF;
         END IF;
      END IF;
   END PROCESS task_register;

END GENERATE if_make_task;

-- Without tasks: task is tied to zero.
else_make_task: IF  NOT with_tasks  GENERATE

   task <= (OTHERS => '0');

END GENERATE else_make_task;

-----------------------------------------------------------------------
-- interrupt processing
-----------------------------------------------------------------------

-- Interrupt logic, only generated when the design has interrupt inputs.
interrupt_services: IF  interrupts /= 0  GENERATE

   sync_in: PROCESS (reset, clk)

      -- reset values shared by the asynchronous and the synchronous reset path
      PROCEDURE clear_interrupts IS
      BEGIN
         ienable <= (OTHERS => '0');
         pending <= (OTHERS => '0');
      END clear_interrupts;

   BEGIN
      IF  reset = '1' AND async_reset  THEN
         clear_interrupts;
      ELSIF  rising_edge(clk)  THEN
         IF  reset = '1' AND NOT async_reset  THEN
            clear_interrupts;
         ELSE
            -- writing INT_REG modifies the enable mask: the sign bit of NOS
            -- selects between clearing bits (AND) and setting bits (OR)
            IF  core_en = '1' AND uReg_write(uBus, INT_REG)  THEN
               IF  r.nos(signbit) /= '0'  THEN
                  ienable <= ienable AND r.nos(interrupts-1 DOWNTO 0);
               ELSE
                  ienable <= ienable OR  r.nos(interrupts-1 DOWNTO 0);
               END IF;
            END IF;
            -- enabled interrupt flags are sampled whenever read_en is active
            IF  read_en = '1'  THEN
               pending <= (flags(interrupts-1 DOWNTO 0) AND ienable);
            END IF;
         END IF;
      END IF;
   END PROCESS sync_in;

   -- request an interrupt when interrupts are enabled (ie), no interrupt is
   -- in service (iis), at least one is pending and op_INT is not already the
   -- current instruction
   interrupt <= '1' WHEN      s.ie = '1'
                          AND s.iis = '0'
                          AND pending /= 0
                          AND r.inst /= op_INT
                ELSE  '0';

END GENERATE interrupt_services;

-- Without interrupt inputs no interrupt is ever requested.
no_interrupt_services: IF  interrupts = 0  GENERATE

   interrupt <= '0';

END GENERATE no_interrupt_services;

-----------------------------------------------------------------------
-- exceptions and time
-- semaphore flags: an exception is raised when a '1' is stored into a flag that is already '1'
-----------------------------------------------------------------------

-- exception is '1' when uReg_write(uBus, FLAG_REG) is true (presumably a
-- write access to FLAG_REG - confirm in the functions package) and at least
-- one bit that is set in r.nos is also set in flags; in all other cases it
-- follows the except input port.
exception <= '1' WHEN  uReg_write(uBus, FLAG_REG) AND (r.nos(flag_width-1 DOWNTO 0) AND flags) /= 0  ELSE  except;

-- Timer: time_ctr counts down on every clock; each time it reaches zero it is
-- reloaded, time is incremented and tick is '1' for one clock cycle.
time_counter : PROCESS (clk, reset)

   -- reset values shared by the asynchronous and the synchronous reset path
   PROCEDURE clear_timer IS
   BEGIN
      tick <= '0';
      time_ctr <= 0;
      time <= (OTHERS => '0');
   END clear_timer;

BEGIN
   IF  reset = '1' AND async_reset  THEN
      clear_timer;
   ELSIF  rising_edge(clk)  THEN
      IF  reset = '1' AND NOT async_reset  THEN
         clear_timer;
      ELSIF  time_ctr /= 0  THEN
         tick <= '0';
         time_ctr <= time_ctr - 1;
      ELSE
         tick <= '1';
         time <= time + 1;
         -- a reload value of 1/100 of the normal one is used in simulation
         IF  simulation  THEN
            time_ctr <= time_cnt/100;
         ELSE
            time_ctr <= time_cnt;
         END IF;
      END IF;
   END IF;
END PROCESS time_counter;

------------------------------------------------------------------------------
-- status_register 
------------------------------------------------------------------------------

-- live flags derived from the top of stack
sign <= r.tos(r.tos'high);
zero <= '1' WHEN  r.tos = 0  ELSE '0';

-- while s.lit is not '0' the stored s.n / s.z are reported instead of the live flags
s_n  <= s.n  WHEN  s.lit /= '0'  ELSE  sign;
s_z  <= s.z  WHEN  s.lit /= '0'  ELSE  zero;

-- Status register: loads s_in whenever the core is enabled.
status_bits: PROCESS(clk, reset)

   -- reset values shared by both reset paths: everything '0' except fwrd
   PROCEDURE reset_status IS
   BEGIN
      s <= (OTHERS => '0');
      s.fwrd <= '1';
   END reset_status;

BEGIN
   IF  reset = '1' AND async_reset  THEN
      reset_status;
   ELSIF  rising_edge(clk)  THEN
      IF  reset = '1' AND NOT async_reset  THEN
         reset_status;
      ELSIF  core_en = '1'  THEN
         s <= s_in;
      END IF;
   END IF;
END PROCESS status_bits;

------------------------------------------------------------------------------
-- (data_width+1)-bit adder with carry in (33x33 for data_width = 32) - instantiate technology specific adders here
------------------------------------------------------------------------------

-- Both operands are extended by a leading '0' so that the result has one
-- more bit than ladd_x / ladd_y; cin is added as carry input.
ladd_out <= ('0' & ladd_x) + ('0' & ladd_y) + cin;
	
-- sum is the low part of the adder result, as wide as the sum signal itself.
sum <= ladd_out(sum'high DOWNTO 0);

------------------------------------------------------------------------------
-- 32x32 unsigned multiplier - instantiate technology specific multipliers here
------------------------------------------------------------------------------

-- Both operands are declared data_width+1 bits wide and product twice that.
-- NOTE(review): "*" presumably resolves to the signed operator of
-- IEEE.STD_LOGIC_signed, so an operand with a leading '0' would be treated as
-- a non-negative number - confirm against the package in use.
product <= multiplicand * multiplier;

------------------------------------------------------------------------------
-- instruction decoder
------------------------------------------------------------------------------

uCore_control: PROCESS
   (uBus, r, r_in, time, task,
    ladd_x, ladd_y, cin, ladd_out, add_x, add_y, sum,
    multiplicand, multiplier, product,
    status, s, zero, s_z, sign, s_n,
    mem_wr, mem_addr, data_din,
    paddr, prog_din, dst_din, dst_dout,
    core_en, exception, interrupt
   )

   VARIABLE rsp_plus     : rs_addr;
   VARIABLE rsp_minus    : rs_addr;
   VARIABLE dsp_plus     : ds_addr;
   VARIABLE dsp_minus    : ds_addr;
   VARIABLE tos_plus     : ds_addr;
   VARIABLE overflow     : STD_LOGIC;
   VARIABLE nos_zero     : STD_LOGIC;
   VARIABLE tos_power2   : data_bus;
   VARIABLE temp         : data_bus;
-- floating point
   VARIABLE fexp         : exponent;
   VARIABLE mantissa     : data_bus;
   CONSTANT exp_min      : data_bus := (slice('1', data_width - exp_width + 1) & slice('0', exp_width-1));
   CONSTANT fmax_pos     : data_bus := ('0' & slice('1', data_width-1 - exp_width) & slice('1', exp_width));
   CONSTANT fmax_neg     : data_bus := ('1' & slice('0', data_width-1 - exp_width) & slice('1', exp_width));
   CONSTANT zero_pos     : data_bus := ('0' & slice('0', data_width-1));
   CONSTANT zero_neg     : data_bus := ('1' & slice('0', data_width-1));

   ALIAS nibble          : STD_LOGIC_VECTOR(6 DOWNTO 0) IS r.inst(6 DOWNTO 0);
   ALIAS i_group         : STD_LOGIC_VECTOR(2 DOWNTO 0) IS r.inst(2 DOWNTO 0);
   ALIAS i_usr           : STD_LOGIC_VECTOR(4 DOWNTO 0) IS r.inst(4 DOWNTO 0);

   ALIAS add_sign        : STD_LOGIC IS ladd_out(data_width-1);
   ALIAS add_carry       : STD_LOGIC IS ladd_out(data_width  );
   ALIAS div_sign        : STD_LOGIC IS ladd_out(data_width  );
   ALIAS div_carry       : STD_LOGIC IS ladd_out(data_width+1);
   
   -- Data memory write access: asserts both the write and the enable signal
   -- that drive the datamem port.
   PROCEDURE mem_write IS
   BEGIN
      mem_wr <= '1';
      mem_en <= '1';
   END mem_write;

   -- Data memory read access: asserts only the enable signal; mem_wr is left
   -- untouched by this procedure.
   PROCEDURE mem_read IS
   BEGIN
      mem_en <= '1';
   END mem_read;

   -- Make room on the data stack: the stack pointer advances to dsp_plus, the
   -- old NOS is written to the stack RAM at that address and TOS is copied
   -- into NOS. The caller supplies the new TOS. The RAM write only happens
   -- when the core is enabled.
   PROCEDURE push_stack IS
   BEGIN
     r_in.dsp <= dsp_plus;
     dst_addr <= dsp_plus;
     dst_dout <= r.nos;
     dst_wr <= core_en;
     r_in.nos <= r.tos;
   END push_stack;
   
   -- Drop the top data stack item: NOS moves into TOS, NOS is refilled from
   -- the stack RAM word addressed by the current stack pointer and the stack
   -- pointer moves to dsp_minus. The stack RAM is not written.
   PROCEDURE pop_stack IS
   BEGIN
     dst_wr <= '0';
     dst_addr <= r.dsp;
     r_in.dsp <= dsp_minus;
     r_in.nos <= dst_din;
     r_in.tos <= r.nos;
   END pop_stack;
   
   -- Push onto the return stack, which lives in data memory: the current TOR
   -- is written to the address formed from rsp_minus and the return stack
   -- pointer is decremented. The caller supplies the new TOR.
   PROCEDURE push_rstack IS
      VARIABLE rs_mem_addr : data_bus;  -- full width data memory address of the new stack slot
   BEGIN
      -- address = zero padding & ones & decremented return stack pointer
      IF  data_addr_width > rs_base_width  THEN
         rs_mem_addr := slice('0', data_width-rs_base_width)   & slice('1', rs_base_width   - (rs_addr_width+tasks_addr_width)) & rsp_minus;
      ELSE
         rs_mem_addr := slice('0', data_width-data_addr_width) & slice('1', data_addr_width - (rs_addr_width+tasks_addr_width)) & rsp_minus;
      END IF;
      r_in.rsp <= rsp_minus;
      mem_addr <= rs_mem_addr;
      mem_dout <= r.tor;
      mem_write;
   END push_rstack;
   
   -- Pop from the return stack in data memory: the word addressed by the
   -- current return stack pointer is read into TOR and the pointer is
   -- incremented.
   PROCEDURE pop_rstack IS
      VARIABLE rs_mem_addr : data_bus;  -- full width data memory address of the top stack slot
   BEGIN
      -- address = zero padding & ones & current return stack pointer
      IF  data_addr_width > rs_base_width  THEN
         rs_mem_addr := slice('0', data_width-rs_base_width)   & slice('1', rs_base_width   - (rs_addr_width+tasks_addr_width)) & r.rsp;
      ELSE
         rs_mem_addr := slice('0', data_width-data_addr_width) & slice('1', data_addr_width - (rs_addr_width+tasks_addr_width)) & r.rsp;
      END IF;
      mem_read;
      mem_addr <= rs_mem_addr;
      r_in.tor <= data_din;
      r_in.rsp <= rsp_plus;
   END pop_rstack;
   
   -- Call the trap routine selected by the 5 bit code i: the current pc is
   -- saved in TOR (the old TOR goes to the return stack) and the program
   -- address becomes i shifted left by usr_vect_width, zero extended.
   PROCEDURE call_trap (i : IN STD_LOGIC_VECTOR(4 DOWNTO 0)) IS
      VARIABLE trap_vector : program_addr;
   BEGIN
      trap_vector := slice('0', (prog_addr_width-(inst_width-3+usr_vect_width))) & i & slice('0', usr_vect_width);
      push_rstack;
      r_in.tor <= slice('0', data_width-prog_addr_width) & r.pc;
      paddr <= trap_vector;
   END call_trap;

   -- Set the program address from TOS: taken as it is when s.lit = '0',
   -- otherwise added to the current pc.
   PROCEDURE branch IS
      VARIABLE target : program_addr;
   BEGIN
      target := r.tos(prog_addr_width-1 DOWNTO 0);
      IF  s.lit /= '0'  THEN
         target := r.pc + target;   -- additional full adder, not time critical
      END IF;
      paddr <= target;
   END branch;

   -- Conditional branch: the data stack is always popped, the branch itself
   -- is only executed when flag is '1'.
   PROCEDURE conditional (flag : IN STD_LOGIC) IS
   BEGIN
      pop_stack;
      CASE  flag  IS
      WHEN '1'    => branch;
      WHEN OTHERS => NULL;
      END CASE;
   END conditional;

   -- Load the next status register value from the corresponding bits of TOS.
   -- The z and n bits are only taken from TOS when the lit bit of TOS is set;
   -- otherwise whatever was assigned to them before this call stays in effect.
   PROCEDURE status_wr IS
   BEGIN
      -- literal flag and the two flags that depend on it
      s_in.lit  <= r.tos(lit_bit);
      IF  r.tos(lit_bit) = '1'  THEN
         s_in.n <= r.tos(n_bit);
         s_in.z <= r.tos(z_bit);
      END IF;
      -- arithmetic flags
      s_in.c    <= r.tos(c_bit);
      s_in.ovfl <= r.tos(ovfl_bit);
      s_in.unfl <= r.tos(unfl_bit);
      s_in.div  <= r.tos(div_bit);
      s_in.den  <= r.tos(den_bit);
      -- interrupt flags
      s_in.ie   <= r.tos(ie_bit);
      s_in.iis  <= r.tos(iis_bit);
      -- remaining flags
      s_in.word <= r.tos(word_bit);
      s_in.fwrd <= r.tos(fwrd_bit);
   END status_wr;

BEGIN
   
   IF  tasks_addr_width = 0  THEN
      dsp_plus  := r.dsp + 1;
      tos_plus  := r.tos(r.dsp'high DOWNTO 0) + 1;
      dsp_minus := r.dsp - 1;
      rsp_plus  := r.rsp + 1;
      rsp_minus := r.rsp - 1;
   ELSE
      dsp_plus  := r.dsp(r.dsp'high DOWNTO ds_addr_width) &  r.dsp(ds_addr_width-1 DOWNTO 0) + 1;
      tos_plus  := r.tos(r.dsp'high DOWNTO ds_addr_width) &  r.tos(ds_addr_width-1 DOWNTO 0) + 1;
      dsp_minus := r.dsp(r.dsp'high DOWNTO ds_addr_width) &  r.dsp(ds_addr_width-1 DOWNTO 0) - 1;
      rsp_plus  := r.rsp(r.rsp'high DOWNTO rs_addr_width) &  r.rsp(rs_addr_width-1 DOWNTO 0) + 1;
      rsp_minus := r.rsp(r.rsp'high DOWNTO rs_addr_width) &  r.rsp(rs_addr_width-1 DOWNTO 0) - 1;
   END IF;
   
   IF  r.nos = 0  THEN
      nos_zero := '1';
   ELSE
      nos_zero := '0';
   END IF;

   FOR i IN 0 TO data_width-1 LOOP
      IF  r.tos(r.tos'high) = '0'  THEN
         IF  i = conv_INTEGER(r.tos)  THEN
            tos_power2(i) := '1';
         ELSE
            tos_power2(i) := '0';
         END IF;
      ELSE
         IF  (i - data_width) = conv_INTEGER(r.tos)  THEN
            tos_power2(i) := '1';
         ELSE
            tos_power2(i) := '0';
         END IF;
      END IF;
   END LOOP;

-- status and uCore registers
   s_in <= s;
   s_in.lit <= '0';
   s_in.n <= sign;
   s_in.z <= zero;
   r_in <= r;

-- data stack memory
   dst_wr <= '0';
   dst_addr <= r.dsp;
   dst_dout <= r.nos;

-- arithmetic
   overflow := '0';
   cin <= '0';
   add_x <= r.tos;
   add_y <= r.nos;
   ladd_x <= '0' & add_x;
   ladd_y <= '0' & add_y;

   multiplicand <= '0' & r.nos;
   multiplier <= '0' & r.tos;

-- floating point
   fexp := NOT r.tos(exp_width-1) & r.tos(exp_width-2 DOWNTO 0);

-- data memory
   mem_en <= '0';
   mem_wr <= '0';
   mem_addr <= sum;
   mem_dout <= r.nos;
   reg_addr <= (OTHERS => '0');

-- program memory
   paddr <= r.pc;
   pwrite <= '0';

------------------------------------------------------------------------------
-- program flow, interrupt and exception
------------------------------------------------------------------------------

   IF  exception = '1'  THEN
      r_in.inst <= op_EXC;
      r_in.pc <= paddr;
   ELSIF  interrupt = '1'  THEN
      r_in.inst <= op_INT;
      r_in.pc <= paddr;
   ELSE
      r_in.inst <= prog_din;
      r_in.pc <= paddr + 1;
   END IF;

------------------------------------------------------------------------------
-- literal instructions
------------------------------------------------------------------------------

   IF  r.inst(r.inst'high) = '1'  THEN
      s_in.lit <= '1';
      IF  s.lit = '0'  THEN
         push_stack;
         r_in.tos <= slice(nibble(6), data_width-r.inst'high) & nibble;
      ELSE
         r_in.tos <= r.tos(data_width-8 DOWNTO 0) & nibble;
         s_in.z <= s.z;
         s_in.n <= s.n;
      END IF;

   ELSE -- opcodes

------------------------------------------------------------------------------
-- data memory autoincrement store and load group
------------------------------------------------------------------------------

      IF    (r.inst(7 DOWNTO 3) = op_STORE(7 DOWNTO 3))
		  AND (NOT WITH_BYTES OR NOT(r.inst = op_cST OR r.inst = op_wST OR r.inst = op_iST))
      THEN
         pop_stack;
         add_x <= r.tos;
         add_y <= slice(i_group(2), data_width-3) & i_group; -- Sign extend top bit of i_group into full width with i_group at bottom
         r_in.tos <= sum;
         mem_write;
         mem_dout <= r.nos;
         mem_addr <= sum;
         IF  sum(data_width-1) = '1'  THEN
            IF  sum(data_width-1 DOWNTO reg_addr_width) = -1  THEN
               reg_addr <= sum(reg_addr'high DOWNTO 0);
               mem_en <= '0';
            ELSIF  prog_ram_width /= 0 AND sum(data_width-2 DOWNTO prog_ram_width) = 0  THEN
               mem_en <= '0';
               mem_wr <= '0';
               pwrite <= '1';
               IF  prog_addr_width = data_width  THEN
                  paddr <= '0' & sum(prog_addr_width-1-1 DOWNTO 0);
               ELSE
                  paddr <= sum(prog_addr_width-1 DOWNTO 0);
               END IF;
               r_in.inst <= op_NOP;
               r_in.pc <= r.pc;
            END IF;
         END IF;
      END IF;

      IF    (r.inst(7 DOWNTO 3) = op_LOAD(7 DOWNTO 3))
		  AND (NOT WITH_BYTES OR NOT(r.inst = op_cLD OR r.inst = op_wLD OR r.inst = op_iLD))
      THEN
         push_stack;
         r_in.nos <= data_din;
         add_x <= r.tos;
         add_y <= slice(i_group(2), data_width-3) & i_group; -- Sign extend top bit of i_group into full width with i_group at bottom
         r_in.tos <= sum;
         mem_addr <= sum;
         mem_read;
         IF  sum(data_width-1) = '1'  THEN
            IF  sum(data_width-1 DOWNTO reg_addr_width) = -1  THEN
               mem_en <= '0';
               reg_addr <= sum(reg_addr'high DOWNTO 0);
               r_in.nos <= sources(conv_INTEGER(sum(reg_addr_width DOWNTO 0))); -- always negative addresses
            ELSIF  prog_ram_width /= 0 AND sum(data_width-2 DOWNTO prog_ram_width) = 0  THEN
               mem_en <= '0';
               IF  prog_addr_width = data_width  THEN
                  paddr <= '0' & sum(prog_addr_width-1-1 DOWNTO 0);
               ELSE
                  paddr <= sum(prog_addr_width-1 DOWNTO 0);
               END IF;
               r_in.nos <= slice('0', data_width-inst_width) & prog_din;
               r_in.inst <= op_NOP;
               r_in.pc <= r.pc;
            END IF;
         END IF;
      END IF;

------------------------------------------------------------------------------
-- single instructions
------------------------------------------------------------------------------

      CASE r.inst IS

      WHEN op_NOP   => NULL;

------------------------------------------------------------------------------
-- more data memory access
------------------------------------------------------------------------------

-- byte store instructions, cST and wST are 2 cycle read-modify-write
      WHEN op_cST => IF  WITH_BYTES  THEN
                        mem_read;
                        mem_addr <= "00" & r.tos(r.tos'high DOWNTO 2);
                        r_in.tos <= "00" & r.tos(r.tos'high DOWNTO 2);
                        CASE  r.tos(1 DOWNTO 0)  IS  -- little endian byte order
                        WHEN "00"   => r_in.nos <= data_din(dw-1 DOWNTO    8) & r.nos(7 DOWNTO 0);
                        WHEN "01"   => r_in.nos <= data_din(dw-1 DOWNTO   16) & r.nos(7 DOWNTO 0) & data_din(   7 DOWNTO 0);
                        WHEN "10"   => r_in.nos <= data_din(dw-1 DOWNTO dw-8) & r.nos(7 DOWNTO 0) & data_din(  15 DOWNTO 0);
                        WHEN OTHERS => r_in.nos <=                              r.nos(7 DOWNTO 0) & data_din(dw-9 DOWNTO 0);
                        END CASE;
                        r_in.inst <= op_STORE;
                        r_in.pc <= paddr;
                     END IF;

      WHEN op_wST => IF  WITH_BYTES  THEN
                        IF  r.tos(0) = '0'  THEN  -- valid byte address
                           mem_read;
                           mem_addr <= "00" & r.tos(r.tos'high DOWNTO 2);
                           r_in.tos <= "00" & r.tos(r.tos'high DOWNTO 2);
                           IF  r.tos(1) = '0'  THEN
                              r_in.nos <= data_din(dw-1 DOWNTO dw-16) & r.nos(15 DOWNTO 0);
                           ELSE
                              r_in.nos <= r.nos(15 DOWNTO 0) & data_din(15 DOWNTO 0);
                           END IF;
                           r_in.inst <= op_STORE;
                           r_in.pc <= paddr;
                        ELSE                  -- invalid byte address
                           r_in.tos <= r.tos(r.tos'high DOWNTO 1) & '0';
                           call_trap(op_ADDR(4 DOWNTO 0));
                        END IF;
                     END IF;

      WHEN op_iST => IF  WITH_BYTES  THEN
                        IF  r.tos(1 DOWNTO 0) = "00"  THEN -- valid byte address?
                           pop_stack;
                           r_in.tos <= "00" & r.tos(r.tos'high DOWNTO 2);
                           mem_addr <= "00" & r.tos(r.tos'high DOWNTO 2);
                           mem_write;
                           mem_dout <= r.nos;
                        ELSE                           -- invalid byte address
                           r_in.tos <= r.tos(r.tos'high DOWNTO 2) & "00";
                           call_trap(op_ADDR(4 DOWNTO 0));
                        END IF;
                     END IF;

-- byte load instructions
      WHEN op_cLD => IF  WITH_BYTES  THEN
                        push_stack;
                        mem_read;
                        mem_addr <= "00" & r.tos(r.tos'high DOWNTO 2);
                        CASE r.tos(1 DOWNTO 0) IS -- little endian byte order
                        WHEN "00"   => r_in.nos <= slice('0', data_width-8) & data_din(   7 DOWNTO  0);
                        WHEN "01"   => r_in.nos <= slice('0', data_width-8) & data_din(  15 DOWNTO  8);
                        WHEN "10"   => r_in.nos <= slice('0', data_width-8) & data_din(  23 DOWNTO 16);
                        WHEN OTHERS => r_in.nos <= slice('0', data_width-8) & data_din(dw-1 DOWNTO 24);
                        END CASE;
                        s_in.word <= '0';
                     END IF;

      WHEN op_wLD => IF  WITH_BYTES  THEN
                        IF  r.tos(0) = '0'  THEN  -- valid byte address
                           push_stack;
                           mem_read;
                           mem_addr <= "00" & r.tos(r.tos'high DOWNTO 2);
                           IF  r.tos(1) = '0'  THEN
                              r_in.nos <= slice('0', data_width-16) & data_din(15 DOWNTO  0);
                           ELSE
                              r_in.nos <= slice('0', data_width-16) & data_din(dw-1 DOWNTO 16);
                           END IF;
                           s_in.word <= '1';
                        ELSE                  -- invalid byte address
                           r_in.tos <= r.tos(r.tos'high DOWNTO 1) & '0';
                           call_trap(op_ADDR(4 DOWNTO 0));
                        END IF;
                     END IF;

      WHEN op_iLD => IF  WITH_BYTES  THEN
                        IF  r.tos(1 DOWNTO 0) = "00"  THEN -- valid byte address?
                           push_stack;
                           mem_read;
                           mem_addr <= "00" & r.tos(r.tos'high DOWNTO 2);
                           r_in.nos <= data_din;
                        ELSE                           -- invalid byte address
                           r_in.tos <= r.tos(r.tos'high DOWNTO 2) & "00";
                           call_trap(op_ADDR(4 DOWNTO 0));
                        END IF;
                     END IF;

      WHEN op_SIGNED => -- sign extension after cLD or wLD
                       IF  WITH_BYTES  THEN
                          IF  s.word = '0'  THEN
                             r_in.tos <= slice(r.tos( 7), 24) & r.tos( 7 DOWNTO 0);
                          ELSE
                             r_in.tos <= slice(r.tos(15), 16) & r.tos(15 DOWNTO 0);
                          END IF;
                       END IF;

      WHEN op_PST   => -- indivisible read-modify-write +! instruction
                       mem_read;
                       mem_addr <= r.tos;
                       add_x <= data_din;
                       add_y <= r.nos;
                       cin <= '0';
                       r_in.nos <= sum;
                       r_in.inst <= op_STORE;
                       r_in.pc <= paddr;
   
      WHEN op_INDEX => -- DO ... LOOP index computed from top two items on return stack
                       push_stack;
                       mem_read;
                       IF  data_addr_width > rs_base_width  THEN
                          mem_addr <= slice('0', data_width-rs_base_width)   & slice('1', rs_base_width   - (rs_addr_width+tasks_addr_width)) & r.rsp;
                       ELSE
                          mem_addr <= slice('0', data_width-data_addr_width) & slice('1', data_addr_width - (rs_addr_width+tasks_addr_width)) & r.rsp;
                       END IF;
                       add_x <= data_din;
                       add_y <= NOT r.tor;
                       cin <= '1';
                       r_in.nos <= sum;

------------------------------------------------------------------------------
-- return stack manipulation
------------------------------------------------------------------------------

      WHEN op_RDROP => pop_rstack;
   
      WHEN op_RPUSH => pop_stack;
                       push_rstack;
                       r_in.tor <= r.tos;
   
      WHEN op_RPOP  => pop_rstack;
                       push_stack;
                       r_in.tos <= r.tor;
                       
      WHEN op_WLOCAL => pop_stack;
                       add_x <= r.tos;
                       add_y <= slice('0', data_width-rs_base_width) & slice('1', rs_base_width-rs_addr_width-tasks_addr_width) & rsp_minus;
                       r_in.tos <= sum;
                       mem_addr <= sum;
                       mem_write;
                       mem_dout <= r.nos;

      WHEN op_RLOCAL => push_stack;
                       add_x <= r.tos;
                       add_y <= slice('0', data_width-rs_base_width) & slice('1', rs_base_width-rs_addr_width-tasks_addr_width) & rsp_minus;
                       r_in.tos <= sum;
                       mem_addr <= sum;
                       mem_read;
                       r_in.nos <= data_din;
   
------------------------------------------------------------------------------
-- data stack manipulation
------------------------------------------------------------------------------

      WHEN op_DROP  => pop_stack;

      WHEN op_NIP   => pop_stack;
                       r_in.tos <= r.tos;
   
      WHEN op_DUP   => push_stack;
   
      WHEN op_QDUP  => IF  r.tos /= 0  THEN -- ?DUP
                          push_stack;
                       END IF;

      WHEN op_OVER  => push_stack;
                       r_in.tos <= r.nos;
   
      WHEN op_UNDER => push_stack;
                       r_in.nos <= r.nos;
   
      WHEN op_TUCK  => push_stack;
                       r_in.nos <= r.nos;
                       dst_dout <= r.tos;
   
      WHEN op_SWAP  => r_in.nos <= r.tos;
                       r_in.tos <= r.nos;
   
      WHEN op_ROT   => r_in.tos <= dst_din;
                       r_in.nos <= r.tos;
                       dst_wr <= core_en;
   
      WHEN op_NROT  => r_in.tos <= r.nos;
                       r_in.nos <= dst_din;
                       dst_dout <= r.tos;
                       dst_wr <= core_en;
   
------------------------------------------------------------------------------
-- register access
------------------------------------------------------------------------------
   
      WHEN op_RTOR   => push_stack;
                        r_in.tos <= r.tor;
   
      WHEN op_WRSP   => pop_stack;
                        r_in.rsp <= r.tos(r.rsp'high DOWNTO 0);
   
      WHEN op_RRSP   => push_stack;
                        r_in.tos <= slice('0', data_width-rs_base_width) & slice('1', rs_base_width-rs_addr_width-tasks_addr_width) & r.rsp;

      WHEN op_WDSP   => pop_stack;
                        r_in.dsp <= r.tos(r.dsp'high DOWNTO 0);
                        dst_addr <= tos_plus;

      WHEN op_RDSP   => push_stack;
                        r_in.tos <= slice('0', data_width-(r.dsp'high+1)) & r.dsp;

      WHEN op_WSTAT  => pop_stack;
                        status_wr;
   
      WHEN op_RSTAT  => push_stack;
                        r_in.tos <= slice('0', data_width-status_width) & status;
   
      WHEN op_SSTAT =>
         -- Selective status update. The most significant bit of TOS selects the mode:
         --   '1'        -> every listed flag is ANDed with its TOS bit (reset bits)
         --   otherwise  -> every listed flag is ORed  with its TOS bit (set bits)
         pop_stack;
         IF  r.tos(r.tos'high) /= '1'  THEN     -- set bits
            s_in.c    <= r.tos(c_bit)    OR s.c;
            s_in.ovfl <= r.tos(ovfl_bit) OR s.ovfl;
            s_in.ie   <= r.tos(ie_bit)   OR s.ie;
            s_in.iis  <= r.tos(iis_bit)  OR s.iis;
            s_in.word <= r.tos(word_bit) OR s.word;
            s_in.unfl <= r.tos(unfl_bit) OR s.unfl;
            s_in.fwrd <= r.tos(fwrd_bit) OR s.fwrd;
         ELSE                                   -- reset bits
            s_in.c    <= r.tos(c_bit)    AND s.c;
            s_in.ovfl <= r.tos(ovfl_bit) AND s.ovfl;
            s_in.ie   <= r.tos(ie_bit)   AND s.ie;
            s_in.iis  <= r.tos(iis_bit)  AND s.iis;
            s_in.word <= r.tos(word_bit) AND s.word;
            s_in.unfl <= r.tos(unfl_bit) AND s.unfl;
            s_in.fwrd <= r.tos(fwrd_bit) AND s.fwrd;
         END IF;

      -- op_WTASK: write NOS to data memory at the address produced by the adder
      -- from TOS and the zero-extended task signal. The address is also left in TOS.
      -- The instruction does nothing unless with_tasks is true.
      WHEN op_WTASK  => IF  with_tasks   THEN
                           pop_stack;
                           add_x <= r.tos;                                          -- adder operand 1: TOS
                           add_y <= slice('0', data_width-data_addr_width) & task;  -- adder operand 2: task, zero-extended
                           r_in.tos <= sum;                                         -- assigned after pop_stack, so this value wins
                           mem_addr <= sum;
                           mem_write;
                           mem_dout <= r.nos;                                       -- data to be written
                        END IF;
   
      -- op_RTASK: read data memory at the address produced by the adder from TOS
      -- and the zero-extended task signal. The address goes to TOS, the memory
      -- data (data_din) to NOS. Does nothing unless with_tasks is true.
      WHEN op_RTASK  => IF  with_tasks  THEN
                           push_stack;
                           add_x <= r.tos;                                          -- adder operand 1: TOS
                           add_y <= slice('0', data_width-data_addr_width) & task;  -- adder operand 2: task, zero-extended
                           r_in.tos <= sum;                                         -- assigned after push_stack, so this value wins
                           mem_addr <= sum;
                           mem_read;
                           r_in.nos <= data_din;                                    -- memory read data
                        END IF;

------------------------------------------------------------------------------
-- call & exit
------------------------------------------------------------------------------
   
      -- op_EXIT: pop the return stack and continue at the program address
      -- held in the low prog_addr_width bits of TOR.
      WHEN op_EXIT   => pop_rstack;
                        paddr <= r.tor(prog_addr_width-1 DOWNTO 0);
   
      -- op_IRET: as op_EXIT, but additionally pops the data stack and runs status_wr.
      -- NOTE(review): presumably status_wr reloads the status register from TOS - confirm.
      WHEN op_IRET   => pop_stack;
                        status_wr;
                        pop_rstack;
                        paddr <= r.tor(prog_addr_width-1 DOWNTO 0);

      -- op_CALL: pop the data stack, push the return stack, store the
      -- zero-extended pc in TOR and execute branch.
      WHEN op_CALL   => pop_stack;
                        push_rstack;
                        r_in.tor <= slice('0', data_width-prog_addr_width) & r.pc;
                        branch;

      -- op_ZEXIT: pop the data stack; return (as op_EXIT) only when s_z = '1'.
      WHEN op_ZEXIT  => pop_stack;
                        IF  s_z = '1'  THEN
                           pop_rstack;
                           paddr <= r.tor(prog_addr_width-1 DOWNTO 0);
                        END IF;
   
      -- op_NZEXIT: pop the data stack; return (as op_EXIT) only when s_z = '0'.
      WHEN op_NZEXIT => pop_stack;
                        IF  s_z = '0'  THEN
                           pop_rstack;
                           paddr <= r.tor(prog_addr_width-1 DOWNTO 0);
                        END IF;
   
      -- Trap instructions.
      -- NOTE(review): every trap below passes the same argument i_usr to call_trap.
      -- Presumably call_trap derives the actual vector from the current instruction;
      -- confirm against the call_trap procedure that distinct vectors are reached.

      -- op_INT: trap, then push the zero-extended status vector and set the iis flag.
      WHEN op_INT   => call_trap(i_usr);
                       push_stack;
                       r_in.tos <= slice('0', data_width-status_width) & status;
                       s_in.iis <= '1';

      WHEN op_EXC   => call_trap(i_usr);
   
      WHEN op_BREAK => call_trap(i_usr);
   
      WHEN op_DATA  => call_trap(i_usr);
   
      -- op_QOVFL: trap only when the overflow flag is set; otherwise no action here.
      WHEN op_QOVFL => IF  s.ovfl = '1'  THEN
                          call_trap(i_usr);
                       END IF;
   
------------------------------------------------------------------------------
-- branches
------------------------------------------------------------------------------
   
      -- All branches delegate to the conditional procedure with the branch condition
      -- as argument. Several of them additionally force op_DROP into the instruction
      -- register and load pc with paddr; these assignments come after conditional and
      -- therefore override whatever conditional assigned to r_in.inst / r_in.pc.
      -- NOTE(review): presumably the forced op_DROP removes the tested flag from the
      -- stack in an inserted cycle - confirm against conditional.

      WHEN op_ALWAYS => conditional('1');
   
      -- op_QZERO: op_DROP is forced only when s_z = '1'.
      WHEN op_QZERO  => conditional(s_z);
                        IF  s_z = '1'  THEN
                           r_in.inst <= op_DROP;
                           r_in.pc <= paddr;
                        END IF;
   
      WHEN op_SIGN   => conditional(s_n);
                        r_in.inst <= op_DROP;
                        r_in.pc <= paddr;
   
      WHEN op_NSIGN  => conditional(NOT s_n);
                        r_in.inst <= op_DROP;
                        r_in.pc <= paddr;
   
      WHEN op_ZERO   => conditional(s_z);
                        r_in.inst <= op_DROP;
                        r_in.pc <= paddr;
   
      WHEN op_NZERO  => conditional(NOT s_z);
                        r_in.inst <= op_DROP;
                        r_in.pc <= paddr;
   
      -- branches on status flags: nothing is dropped afterwards
      WHEN op_NOVFL  => conditional(NOT s.ovfl);

      WHEN op_NCARRY => conditional(NOT s.c);

      WHEN op_NEXT =>
         -- loop end: while TOR is non-zero decrement it and branch,
         -- otherwise pop the return stack
         pop_stack;
         IF  r.tor /= 0  THEN
            r_in.tor <= r.tor - 1;
            branch;
         ELSE
            pop_rstack;
         END IF;

      WHEN op_FWRD =>
         -- loop start: move the count from TOS to TOR; the sign bit of TOS goes to flag fwrd
         push_rstack;
         pop_stack;
         s_in.fwrd <= r.tos(r.tos'high);
         IF  r.tos /= 0  THEN
            IF  r.tos(r.tos'high) = '0'  THEN
               r_in.tor <= r.tos - 1;           -- non-negative count
            ELSE
               r_in.tor <= NOT r.tos;           -- negative count
            END IF;
         ELSE                                   -- zero count
            r_in.tor <= r.tos;
            paddr <= r.pc + 1;
         END IF;

      WHEN op_BACK =>
         -- loop end: while TOR is non-zero decrement it and go back to pc - 2,
         -- otherwise pop the return stack and set flag fwrd
         IF  r.tor /= 0  THEN
            r_in.tor <= r.tor - 1;
            paddr <= r.pc - 2;
         ELSE
            pop_rstack;
            s_in.fwrd <= '1';
         END IF;

------------------------------------------------------------------------------
-- flags
------------------------------------------------------------------------------

      -- Each flag query leaves all ones in TOS when its condition holds, all zeros otherwise.
      -- The all-zeros value is assigned as default and overridden when the condition is met.

      WHEN op_LESSQ =>
         r_in.tos <= (OTHERS => '0');
         IF  (s.ovfl XOR sign) = '1'  THEN
            r_in.tos <= (OTHERS => '1');
         END IF;

      WHEN op_OVFLQ =>
         push_stack;
         r_in.tos <= (OTHERS => '0');
         IF  s.ovfl = '1'  THEN
            r_in.tos <= (OTHERS => '1');
         END IF;

      WHEN op_CARRYQ =>
         push_stack;
         r_in.tos <= (OTHERS => '0');
         IF  s.c = '1'  THEN
            r_in.tos <= (OTHERS => '1');
         END IF;

      WHEN op_TIMEQ =>
         -- adder is fed with TOS and the inverted time value; the result's sign decides
         cin   <= '0';
         add_y <= NOT time;
         add_x <= r.tos;
         r_in.tos <= (OTHERS => '0');
         IF  add_sign = '1'  THEN
            r_in.tos <= (OTHERS => '1');
         END IF;

------------------------------------------------------------------------------
-- arithmetic
------------------------------------------------------------------------------
   
      WHEN op_NOT =>
         -- bit-wise complement of TOS
         r_in.tos <= NOT r.tos;

      WHEN op_ZEQU =>
         -- all ones when the zero signal (state of TOS) is set, else all zeros
         r_in.tos <= (OTHERS => '0');
         IF  zero = '1'  THEN
            r_in.tos <= (OTHERS => '1');
         END IF;

      WHEN op_ZLESS =>
         -- all ones when the sign bit of TOS is set, else all zeros
         r_in.tos <= (OTHERS => '0');
         IF  r.tos(r.tos'high) = '1'  THEN
            r_in.tos <= (OTHERS => '1');
         END IF;

      -- Two-operand adder instructions: both operands are consumed (pop_stack), the adder
      -- result goes to TOS, carry and overflow flags are updated. The overflow term is only
      -- active when both adder inputs carry the same sign bit.

      WHEN op_ADD =>
         pop_stack;
         cin   <= '0';
         add_x <= r.tos;
         add_y <= r.nos;
         overflow := (add_carry XOR add_sign)
                     AND NOT nos_zero
                     AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.ovfl <= overflow;
         s_in.c    <= add_carry;
         r_in.tos  <= sum;

      WHEN op_ADC =>
         pop_stack;
         cin   <= s.c;                 -- carry flag is the carry input
         add_x <= r.tos;
         add_y <= r.nos;
         overflow := '0';              -- do not set overflow for SAT_ARITH
         s_in.ovfl <= (add_carry XOR add_sign)
                      AND NOT nos_zero
                      AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.c    <= add_carry;
         r_in.tos  <= sum;

      WHEN op_SUB =>
         pop_stack;
         cin   <= '1';                 -- inverted TOS plus carry-in
         add_x <= NOT r.tos;
         add_y <= r.nos;
         overflow := (add_carry XOR add_sign)
                     AND NOT nos_zero
                     AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.ovfl <= overflow;
         s_in.c    <= add_carry;
         r_in.tos  <= sum;

      WHEN op_SSUB =>
         pop_stack;
         cin   <= '1';                 -- inverted NOS plus carry-in
         add_x <= r.tos;
         add_y <= NOT r.nos;
         overflow := (add_carry XOR add_sign)
                     AND NOT zero
                     AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.ovfl <= overflow;
         s_in.c    <= add_carry;
         r_in.tos  <= sum;
   
      -- bit-wise logic on NOS and TOS; both operands are consumed
      WHEN op_AND =>
         pop_stack;
         r_in.tos <= r.nos AND r.tos;

      WHEN op_OR =>
         pop_stack;
         r_in.tos <= r.nos OR r.tos;

      WHEN op_XOR =>
         pop_stack;
         r_in.tos <= r.nos XOR r.tos;

-- 2dup <arith>
      -- Same adder set-up and flag handling as op_ADD/op_ADC/op_SUB/op_SSUB,
      -- but push_stack is called instead of pop_stack.

      WHEN op_PADD =>
         push_stack;
         cin   <= '0';
         add_x <= r.tos;
         add_y <= r.nos;
         overflow := (add_carry XOR add_sign)
                     AND NOT nos_zero
                     AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.ovfl <= overflow;
         s_in.c    <= add_carry;
         r_in.tos  <= sum;

      WHEN op_PADC =>
         push_stack;
         cin   <= s.c;                 -- carry flag is the carry input
         add_x <= r.tos;
         add_y <= r.nos;
         overflow := '0';              -- do not set overflow for SAT_ARITH
         s_in.ovfl <= (add_carry XOR add_sign)
                      AND NOT nos_zero
                      AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.c    <= add_carry;
         r_in.tos  <= sum;

      WHEN op_PSUB =>
         push_stack;
         cin   <= '1';                 -- inverted TOS plus carry-in
         add_x <= NOT r.tos;
         add_y <= r.nos;
         overflow := (add_carry XOR add_sign)
                     AND NOT nos_zero
                     AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.ovfl <= overflow;
         s_in.c    <= add_carry;
         r_in.tos  <= sum;

      WHEN op_PSSUB =>
         push_stack;
         cin   <= '1';                 -- inverted NOS plus carry-in
         add_x <= r.tos;
         add_y <= NOT r.nos;
         overflow := (add_carry XOR add_sign)
                     AND NOT zero
                     AND NOT (ladd_x(r.tos'high) XOR ladd_y(r.tos'high));
         s_in.ovfl <= overflow;
         s_in.c    <= add_carry;
         r_in.tos  <= sum;
   
      -- bit-wise logic on NOS and TOS with push_stack instead of pop_stack
      WHEN op_PAND =>
         push_stack;
         r_in.tos <= r.nos AND r.tos;

      WHEN op_POR =>
         push_stack;
         r_in.tos <= r.nos OR r.tos;

      WHEN op_PXOR =>
         push_stack;
         r_in.tos <= r.nos XOR r.tos;

------------------------------------------------------------------------------
-- shift instructions
------------------------------------------------------------------------------
   
      -- op_SHIFT: logical shift.
      --   with_mult:  NOS (zero-extended by one bit) is multiplied by tos_power2. The sign
      --               bit of TOS selects the direction: left takes the low half of the
      --               product, right takes the high half. Carry receives the product bit
      --               adjacent to the selected half.
      --   otherwise:  TOS is shifted by exactly one bit per execution; the direction comes
      --               from status flag fwrd and carry receives the bit shifted out.
      WHEN op_SHIFT => IF  with_mult  THEN
                          pop_stack;
                          multiplicand <= '0' & r.nos;
                          multiplier   <= '0' & tos_power2;
                          IF  r.tos(r.tos'high) = '0'  THEN    -- shift left
                             r_in.tos <= product(data_width-1 DOWNTO 0);
                             s_in.c <= product(data_width);
                          ELSE                                 -- shift right
                             r_in.tos <= product(data_width*2-1 DOWNTO data_width);
                             s_in.c <= product(data_width-1);
                          END IF;
      -- bit wise shift with <NEXT
                       ELSIF  s.fwrd = '0'  THEN               -- shift left
                          r_in.tos <= r.tos(r.tos'high - 1 DOWNTO 0) & '0';
                          s_in.c <= r.tos(r.tos'high);
                       ELSE                                    -- shift right
                          r_in.tos <= '0' & r.tos(r.tos'high DOWNTO 1);
                          s_in.c <= r.tos(0);
                       END IF;

      -- op_ASHIFT: arithmetic shift.
      --   with_mult:  NOS (sign-extended by one bit) is multiplied by tos_power2. The sign
      --               bit of TOS selects the direction: left takes the low half of the
      --               product, right takes the high half. Carry receives the product bit
      --               adjacent to the selected half.
      --   otherwise:  TOS is shifted by exactly one bit per execution; the direction comes
      --               from status flag fwrd. Carry receives the bit shifted out, so that
      --               both implementations leave the same carry (the bit-wise path
      --               previously left the carry flag untouched, unlike the multiplier
      --               path and unlike op_SHIFT / op_DSHIFT / op_DASHIFT).
      WHEN op_ASHIFT => IF  with_mult  THEN
                          pop_stack;
                          multiplicand <= r.nos(r.nos'high) & r.nos;
                          multiplier   <= '0' & tos_power2;
                          IF  r.tos(r.tos'high) = '0'  THEN    -- shift left
                             r_in.tos <= product(data_width-1 DOWNTO 0);
                             s_in.c <= product(data_width);
                          ELSE                                 -- shift right
                             r_in.tos <= product(data_width*2-1 DOWNTO data_width);
                             s_in.c <= product(data_width-1);
                          END IF;
      -- bit wise shift with <NEXT
                       ELSIF  s.fwrd = '0'  THEN               -- shift left
                          r_in.tos <= r.tos(r.tos'high - 1 DOWNTO 0) & '0';
                          s_in.c <= r.tos(r.tos'high);
                       ELSE                                    -- shift right, sign bit replicated
                          r_in.tos <= r.tos(r.tos'high) & r.tos(r.tos'high DOWNTO 1);
                          s_in.c <= r.tos(0);
                       END IF;

      -- op_DSHIFT: double-width logical shift.
      --   with_mult:  the multiplicand is dst_din when the sign bit of TOS is '0', else NOS
      --               (both zero-extended by one bit); the multiplier is tos_power2. The low
      --               half of the product goes to NOS, the high half to TOS.
      --   otherwise:  the pair TOS:NOS is shifted by one bit per execution. fwrd = '0'
      --               shifts left (NOS msb moves into TOS, carry <- TOS msb), otherwise
      --               right (TOS lsb moves into NOS, carry <- NOS lsb).
      WHEN op_DSHIFT => IF  with_mult  THEN
                          pop_stack;
                          multiplier   <= '0' & tos_power2;
                          IF  r.tos(r.tos'high) = '0'  THEN
                             multiplicand <= '0' & dst_din;
                          ELSE
                             multiplicand <= '0' & r.nos;
                          END IF;
                          r_in.nos <= product(data_width-1 DOWNTO 0);
                          r_in.tos <= product(data_width*2-1 DOWNTO data_width);
      -- bit wise shift with <NEXT
                       ELSIF  s.fwrd = '0'  THEN
                          r_in.tos <= r.tos(r.tos'high-1 DOWNTO 0) & r.nos(r.nos'high);
                          r_in.nos <= r.nos(r.nos'high-1 DOWNTO 0) & '0';
                          s_in.c <= r.tos(r.tos'high);
                       ELSE
                          r_in.tos <= '0' & r.tos(r.tos'high DOWNTO 1);
                          r_in.nos <= r.tos(0) & r.nos(r.nos'high DOWNTO 1);
                          s_in.c <= r.nos(0);
                       END IF;

      -- op_DASHIFT: double-width arithmetic shift. Identical to op_DSHIFT except that
      -- the NOS multiplicand is sign-extended and the bit-wise right shift replicates
      -- the sign bit of TOS.
      WHEN op_DASHIFT => IF  with_mult  THEN
                          pop_stack;
                          multiplier   <= '0' & tos_power2;
                          IF  r.tos(r.tos'high) = '0'  THEN
                             multiplicand <= '0' & dst_din;
                          ELSE
                             multiplicand <= r.nos(r.nos'high) & r.nos;
                          END IF;
                          r_in.nos <= product(data_width-1 DOWNTO 0);
                          r_in.tos <= product(data_width*2-1 DOWNTO data_width);
      -- bit wise shift with <NEXT
                       ELSIF  s.fwrd = '0'  THEN
                          r_in.tos <= r.tos(r.tos'high-1 DOWNTO 0) & r.nos(r.nos'high);
                          r_in.nos <= r.nos(r.nos'high-1 DOWNTO 0) & '0';
                          s_in.c <= r.tos(r.tos'high);
                       ELSE
                          r_in.tos <= r.tos(r.tos'high) & r.tos(r.tos'high DOWNTO 1);
                          r_in.nos <= r.tos(0) & r.nos(r.nos'high DOWNTO 1);
                          s_in.c <= r.nos(0);
                       END IF;

--      WHEN op_ROTATE => IF  with_mult  THEN
--                          multiplicand <= '0' & r.nos;
--                          multiplier   <= '0' & tos_power2;
--                          r_in.nos <= product(data_width-1 DOWNTO 0);
--                          r_in.tos <= product(data_width*2-1 DOWNTO data_width);
--                       ELSE
--                          NULL;
--                       END IF;
-- op_ROTATE      Op: (rotate ( n1 n2 -- d )     don't
--             Macro: rotate  ( n1 n2 -- n3 )    ?comp T (rotate or H ;

      WHEN op_PACK =>
         -- shift TOS left by one byte and insert the low byte of NOS
         pop_stack;
         r_in.tos <= r.tos(r.tos'high-8 DOWNTO 0) & r.nos(7 DOWNTO 0);

      WHEN op_UNPACK =>
         -- split TOS: low byte (zero-extended) to NOS, remaining bits shifted right to TOS
         push_stack;
         r_in.tos <= slice('0', 8) & r.tos(r.tos'high DOWNTO 8);
         r_in.nos <= slice('0', dw-8) & r.tos(7 DOWNTO 0);

------------------------------------------------------------------------------
-- complex arithmetic
------------------------------------------------------------------------------

      WHEN op_UMULT => -- unsigned multiply, step instruction when mult-hardware not available
                       IF  with_mult  THEN
                          -- single cycle: both operands zero-extended, double-width product in TOS:NOS
                          multiplicand <= '0' & r.nos;
                          multiplier   <= '0' & r.tos;
                          r_in.tos <= product(data_width*2-1 DOWNTO data_width);
                          r_in.nos <= product(data_width-1   DOWNTO          0);
                       ELSE -- multiply step instruction
                          -- adder: (TOS shifted left by one) + dst_din
                          add_x <= r.tos(r.tos'high-1 DOWNTO 0) & '0';
                          add_y <= dst_din;
                          -- NOS shifts left taking in the msb of TOS; an adder carry is
                          -- propagated into NOS when the addition is actually used
                          IF  add_carry = '1' AND r.nos(r.nos'high) = '1'  THEN
                             r_in.nos <= (r.nos(r.nos'high-1 DOWNTO 0) & r.tos(r.tos'high)) + 1;
                          ELSE
                             r_in.nos <=  r.nos(r.nos'high-1 DOWNTO 0) & r.tos(r.tos'high);
                          END IF;
                          -- the msb of NOS decides between plain shift and shift-and-add
                          IF  r.nos(r.nos'high) = '0'  THEN
                             r_in.tos <= r.tos(r.tos'high-1 DOWNTO 0) & '0';
                          ELSE
                             r_in.tos <= sum;
                          END IF;
                       END IF;

      WHEN op_SMULT => -- signed multiply when mult-hardware available
                       IF  with_mult  THEN
                          -- both operands sign-extended, double-width product in TOS:NOS
                          multiplicand <= r.nos(r.nos'high) & r.nos;
                          multiplier <= r.tos(r.tos'high) & r.tos;
                          r_in.tos <= product(data_width*2-1 DOWNTO data_width);
                          r_in.nos <= product(data_width-1   DOWNTO          0);
                       ELSE
                          NULL;  -- no step-wise implementation: instruction has no effect here
                       END IF;

      WHEN op_MULTL =>
         -- half precision final multiply step setting overflow:
         -- no overflow only when TOS is a pure sign extension of NOS
         -- (TOS zero with NOS non-negative, or TOS = -1 with NOS negative)
         pop_stack;
         overflow := '1';
         IF  (zero = '1' AND r.nos(r.nos'high) = '0') OR (r.tos = -1 AND r.nos(r.nos'high) = '1')  THEN
            overflow := '0';
         END IF;
         s_in.ovfl <= overflow;

      WHEN op_DIV   => -- signed and unsigned division step, once for every bit
                       -- extended adder: (carry OR overflow) & TOS  minus  dst_din
                       ladd_x <= (s.c OR s.ovfl) & r.tos;
                       ladd_y <= NOT('0' & dst_din);
                       cin <= '1';  -- sum = remainder - divisor
                       -- NOS shifts left and takes div_carry in as new lsb
                       r_in.nos <= r.nos(r.nos'high-1 DOWNTO 0) & div_carry;
                       IF  div_carry = '1'  THEN
                          -- subtraction accepted: continue with the difference, shifted left
                          r_in.tos <= sum(r.tos'high-1 DOWNTO 0) & r.nos(r.nos'high);
                          s_in.c <= sum(r.tos'high);
                       ELSE
                          -- subtraction rejected: keep TOS, shifted left
                          r_in.tos <= r.tos(r.tos'high-1 DOWNTO 0) & r.nos(r.nos'high);
                          s_in.c <= r.tos(r.tos'high);
                       END IF;
                       -- overflow is sticky across division steps
                       s_in.ovfl <= NOT (div_carry XOR div_sign) OR s.ovfl;

      WHEN op_UDIVS => -- first unsigned division step
                       -- Rearranges the operands for the following division steps:
                       -- TOS <- NOS, NOS <- dst_din, and the old TOS (divisor) is written
                       -- out via dst_dout.
                       r_in.tos <= r.nos;    -- dividend_high -> tos
                       r_in.nos <= dst_din;  -- dividend_low -> nos
                       dst_dout <= r.tos;    -- divisor -> dst_din
                       -- Flag defaults come BEFORE the zero-divisor check. Within a process the
                       -- last assignment to a signal wins, so clearing ovfl after the check
                       -- (as was done previously) silently discarded the divide-by-zero overflow.
                       s_in.ovfl <= '0';
                       s_in.c <= '0';
                       IF  r.tos = 0  THEN
                          IF  dst_din = 0 AND r.nos = 0  THEN  -- special case: 0 / 0 = zero, no overflow
                             dst_dout(0) <= '1';               -- divide by 1 instead of 0
                          ELSE
                             s_in.ovfl <= '1';                 -- non-zero dividend / 0: flag overflow
                          END IF;
                       END IF;
                       dst_wr <= core_en;

      WHEN op_UDIVL => -- last unsigned division step
                       pop_stack;
                       -- same extended subtraction as in op_DIV
                       ladd_x <= (s.c OR s.ovfl) & r.tos;
                       ladd_y <= NOT('0' & dst_din);
                       cin <= '1';  -- sum = remainder - divisor
                       -- NOS receives the final remainder (not shifted any more)
                       IF  div_carry = '1'  THEN
                          r_in.nos <= sum;
                       ELSE
                          r_in.nos <= r.tos;
                       END IF;
                       -- TOS receives the quotient with div_carry as its last bit
                       r_in.tos <= r.nos(r.nos'high-1 DOWNTO 0) & div_carry;
                       -- a '1' shifted out of the quotient register also counts as overflow
                       s_in.ovfl <= r.nos(r.nos'high) OR s.ovfl;
   
      WHEN op_SDIVS => -- first signed division step with signed divisor
                       -- dup >r   abs >r   dup 0< IF  r@ +  THEN  r> um/mod
                       -- r@ 0< IF  negate over IF  swap r@ + swap 1-  THEN THEN  rdrop
                       r_in.nos <= dst_din;     -- dividend_low -> NOS
                       dst_dout <= abs(r.tos);  -- |divisor| in dst_din
                       dst_wr <= core_en;
                       -- adder: |divisor| + dividend_high, only used for a negative dividend
                       add_x <= abs(r.tos);
                       add_y <= r.nos;
                       cin <= '0';
                       IF  r.nos(r.nos'high) = '1'  THEN  -- negative dividend?
                          r_in.tos <= sum;                -- dividend_high with pre-distortion on negative
                       ELSE
                          r_in.tos <= r.nos;
                       END IF;
                       -- flag defaults first; ovfl may be overridden by the check below
                       s_in.c <= '0';
                       s_in.ovfl <= '0';
                       -- NOTE(review): dst_dout is assigned in this same branch and read back here;
                       -- this only sees the new value if the process is re-evaluated on dst_dout
                       -- (sensitivity list not visible here) - confirm.
                       IF  dst_dout(dst_dout'high) = '1'  THEN  -- |Divisor| = $80000000 !!
                          s_in.ovfl <= '1';
                       END IF;
                       -- remember the signs of divisor and dividend for op_SDIVL
                       s_in.div <= r.tos(r.tos'high);
                       s_in.den <= r.nos(r.nos'high);

      WHEN op_SDIVL => -- last signed division step with signed divisor
                       -- dup >r   abs >r   dup 0< IF  r@ +  THEN  r> um/mod
                       -- r@ 0< IF  negate over IF  swap r@ + swap 1-  THEN THEN  rdrop
                       pop_stack;
                       -- same extended subtraction as in op_DIV
                       ladd_x <= (s.c OR s.ovfl) & r.tos;
                       ladd_y <= NOT('0' & dst_din);
                       cin <= '1';  -- sum = remainder - divisor
                       IF  div_carry = '1'  THEN
                          temp := sum;   -- sum = remainder - divisor
                       ELSE
                          temp := r.tos; -- dividend_high, now remainder
                       END IF;
                       -- default result: remainder in NOS, quotient in TOS
                       r_in.nos <= temp;
                       r_in.tos <= r.nos(r.nos'high-1 DOWNTO 0) & div_carry;
                       -- divisor sign flag set: negate the quotient; with a non-zero remainder
                       -- the quotient is one's-complemented instead and dst_din is subtracted
                       -- from the remainder
                       IF  s.div = '1'  THEN
                          r_in.tos <= NOT (r.nos(r.nos'high-1 DOWNTO 0) & div_carry) + 1;
                          IF  temp /= 0  THEN
                             r_in.tos <= NOT (r.nos(r.nos'high-1 DOWNTO 0) & div_carry);
                             r_in.nos <= temp - dst_din;
                          END IF;
                       END IF;
                       -- evaluate overflow bit
                       -- NOTE(review): r_in.tos is assigned above and read back here; this only
                       -- sees the new value if the process is re-evaluated on r_in
                       -- (sensitivity list not visible here) - confirm.
                       IF    (r_in.tos(r.tos'high) = '1' AND r_in.tos(r.tos'high-1 DOWNTO 0) = 0 AND (s.div XOR s.den) = '0')
                          OR s.ovfl = '1'
                          OR ((s.den XOR s.div XOR r_in.tos(r.tos'high)) = '1' AND r_in.tos /= 0)
                          OR r.nos(r.nos'high) = '1'
                       THEN
                          s_in.ovfl <= '1';
                       END IF;

-- : round   ( dm -- m' )   tuck invert 1 and IF  0< - exit  THEN  drop ;
-- : log2 ( frac -- log2[frac] )   \ Bit-wise Logarithm (K.Schleisiek/U.Lange)
--    #delta_width 0 ?DO  2*  LOOP
--    0   data_width 0
--    DO  2* >r   dup um*
--       dup 0< IF  r> 1+ >r  ELSE  d2*  THEN     \ correction of 'B(i)' and 'A(i)'
--       round   r>                               \ A(i+1):=A(i)*2^(B(i)-1)
--    LOOP  nip
-- ;
      WHEN op_LOGS  => -- log2 bit step
                       -- Squares NOS with the multiplier. The top product bit is shifted into
                       -- TOS as the next result bit and also decides whether the new NOS is
                       -- taken from the product shifted left by one or unshifted.
                       IF  with_mult  THEN
                          multiplicand <= '0' & r.nos;
                          multiplier   <= '0' & r.nos;              -- nos ** 2
                          IF  product(data_width*2-1) = '0'  THEN   -- then shift nos left
                             IF  product(data_width-1) = '0'  THEN
                                r_in.nos <= product(data_width*2-2 DOWNTO data_width) & product(data_width-2); -- round towards odd
                             ELSE
                                r_in.nos <= product(data_width*2-2 DOWNTO data_width-1);
                             END IF;
                          ELSE -- product(data_width*2-1) = '1'     -- don't shift nos
                             IF  product(data_width) = '0'  THEN
                                r_in.nos <= product(data_width*2-1 DOWNTO data_width+1) & product(data_width-1);  -- round towards odd
                             ELSE
                                r_in.nos <= product(data_width*2-1 DOWNTO data_width);
                             END IF;
                          END IF;
                          r_in.tos <= r.tos(data_width-2 DOWNTO 0) & product(data_width*2-1);
                       ELSE
                          NULL;  -- requires the hardware multiplier: no effect without it
                       END IF;

-- : sqrt    ( u -- urem uroot )
--    0 tuck   data_width 2/
--    ?FOR  d2* d2* swap >r   swap 2* 2* 1+
--          2dup - 0< 0= IF  tuck - swap 2 +  THEN
--          u2/ swap r> swap
--    NEXT  nip swap
-- ;
      WHEN op_SQRTS => -- square root 2bits step
                       -- root accumulated in dst_dout
                       -- square decimated in NOS
                       -- remainder in TOS
                       -- trial subtraction: (remainder & next two bits of the square) - (root & "01")
                       add_x <= r.tos(data_width-3 DOWNTO 0) & r.nos(data_width-1 DOWNTO data_width-2);
                       add_y <= NOT (dst_din(data_width-3 DOWNTO 0) & "01"); -- 2s complement subtract
                       cin <= '1';
                       IF  sum(data_width-1) = '1'  THEN
                          -- negative difference: keep the shifted remainder, root bit = 0
                          r_in.tos <= r.tos(data_width-3 DOWNTO 0) & r.nos(data_width-1 DOWNTO data_width-2);
                          dst_dout <= dst_din(data_width-2 DOWNTO 0) & '0';
                       ELSE
                          -- difference fits: it becomes the new remainder, root bit = 1
                          r_in.tos <= sum;
                          dst_dout <= dst_din(data_width-2 DOWNTO 0) & '1';
                       END IF;
                       r_in.nos <= r.nos(data_width-3 DOWNTO 0) & "00";  -- consume two bits of the square
                       dst_wr <= core_en;

------------------------------------------------------------------------------
-- floating point
------------------------------------------------------------------------------

      WHEN op_FMULT => -- fractional signed multiply with standard rounding towards even for .5
                       -- Needs both with_float and with_mult; otherwise the instruction traps.
                       IF  with_float AND with_mult  THEN
                          pop_stack;
                          multiplicand <= r.nos(r.nos'high) & r.nos;  -- NOS sign-extended
                          multiplier   <= '0' & r.tos;                -- TOS zero-extended
                          -- default: truncated high half of the product
                          r_in.tos <= product(data_width*2-1 DOWNTO data_width);
                          -- round up when the discarded half is above .5, or exactly .5 with an odd result
                          IF  product(data_width-1) = '1' AND (product(data_width-2 DOWNTO 0) /= 0  OR  product(data_width) = '1')  THEN  -- round 0.5 to even
                             r_in.tos <= product(data_width*2-1 DOWNTO data_width) + 1;
                          END IF;
                       ELSE
                          call_trap(i_usr);
                       END IF;

-- : normalized?  ( m -- f )   dup #signbit and swap #signbit u2/ and 2* xor ;
-- : normalize    ( m e -- m' e' )
--    over normalized? ?EXIT
--    over 0= IF  drop   #exp_min  EXIT THEN
--    BEGIN  dup #exp_min = ?EXIT
--           1 - swap 2* swap  over normalized?
--    UNTIL
-- ;
-- op_NORM is a single step instruction to be used in >FOR (norm <NEXT loop
      WHEN op_NORM  => -- normalize a 2s-complement mantissa/exponent number pair on the stack
                       -- NOS holds the mantissa, TOS the exponent. One execution shifts the mantissa
                       -- by at most one bit; TOR is cleared unless another step is still needed.
                       -- Traps when with_float is false.
                       IF  with_float  THEN
                          r_in.tor <= (OTHERS => '0');  -- default: finish <NEXT loop
                          IF  r.nos = 0  THEN
                             -- zero mantissa: exponent is replaced by ones followed by exp_width-1 zeros
                             r_in.tos <= slice('1', data_width - exp_width + 1) & slice('0', exp_width-1);
                          ELSIF  r.nos(data_width-1) /= r.nos(data_width-2) OR r.tos = exp_min  THEN  -- already properly formatted or minimum exponent reached
                             NULL;
                          ELSE
                             -- shift the mantissa left by one and decrement the exponent
                             r_in.tos <= r.tos - 1;
                             r_in.nos <= r.nos(r.nos'high-1 DOWNTO 0) & '0';
                             -- the two top bits of the shifted mantissa will still be equal: keep looping
                             IF  r.nos(r.nos'high) = r.nos(r.nos'high-2)  THEN
                                r_in.tor <= r.tor;      -- restore repeat count
                             END IF;
                          END IF;
                       ELSE
                          call_trap(i_usr);
                       END IF;

-- : >float  ( m e -- r )   overflow off   underflow off
--    normalize   swap #man_mask and swap
--    2dup #exp_min =   swap #fzero_neg =   and >r
--    over #fzero_pos =   r> or
--    IF  drop  #exp_mask invert and  EXIT THEN             \ leave floating +/-zero. For +zero irrespective of exponent
--    dup #man_mask 2/ and
--    dup 0< IF  #man_mask 2/ xor  THEN                     \ exponent over/underflow?
--    IF  0< IF  underflow on   0< IF  #fzero_neg  EXIT THEN  #fzero_pos  EXIT THEN
--        overflow on   0< IF  #fmax_neg  EXIT THEN  #fmax_pos  EXIT
--    THEN
--    dup #exp_min = IF  drop #man_mask and  EXIT THEN      \ smallest exponent => denormalized
--    #exp_mask and   #exp_sign xor   swap                  \ flip sign of exponent => bias = #exp_min
--    dup 2* [ #signbit invert #exp_mask invert and ] Literal and
--    swap 0< IF  #signbit or  THEN  or
-- ;
      WHEN op_FLOAT => -- convert 2s-complement mantissa/exponent pair to floating point number
                       -- NOS holds the mantissa, TOS the exponent; the packed number is left in TOS.
                       -- Overflow and underflow flags are cleared first and set again when the
                       -- exponent does not fit. Traps when with_float is false.
                       IF  with_float  THEN
                          s_in.ovfl <= '0';
                          s_in.unfl <= '0';
                          pop_stack;
                          IF  r.nos = zero_pos OR (r.nos = zero_neg AND r.tos = exp_min)  THEN
                             r_in.tos <= r.nos(data_width-1 DOWNTO exp_width) & slice('0', exp_width);             -- leave floating +/-zero. For +zero irrespective of exponent
                          -- exponent within range?
                          ELSIF  r.tos(data_width-1 DOWNTO exp_width-1) = -1 OR r.tos(data_width-1 DOWNTO exp_width-1) = 0  THEN
                             IF  r.tos = exp_min   THEN                                                            -- minimum exponent?
                                r_in.tos <= r.nos(data_width-1 DOWNTO exp_width) & slice('0', exp_width);          -- denormalized number
                             ELSE
                                r_in.tos <= r.nos(data_width-1) & r.nos(data_width-3 DOWNTO exp_width-1) & fexp;   -- normalized number
                             END IF;
                          -- exponent out of range
                          ELSIF  r.tos(data_width-1) = '0'  THEN  -- positive exponent?
                             s_in.ovfl <= '1';
                             IF  r.nos(data_width-1) = '0'  THEN  -- positive mantissa?
                                r_in.tos <= fmax_pos;
                             ELSE
                                r_in.tos <= fmax_neg;
                             END IF;
                          ELSE -- negative exponent
                             s_in.unfl <= '1';
                             IF  r.nos(data_width-1) = '0'  THEN  -- positive mantissa?
                                r_in.tos <= zero_pos;
                             ELSE
                                r_in.tos <= zero_neg;
                             END IF;
                          END IF;
                       ELSE
                          call_trap(i_usr);
                       END IF;
          
-- : float>  ( r -- m e )
--    dup #exp_mask and   ?dup 0= IF  #exp_min  EXIT THEN                                \ de-normalized
--    dup #exp_sign and IF  #exp_mask 2/ and  ELSE  #exp_mask 2/ invert or  THEN  swap   \ flip sign and extend
--    dup 0< IF  #exp_mask 2/ or  2/ [ #signbit #exp_sign or u2/ invert ] Literal and    \ add 0.5 for better rounding
--         ELSE  #man_mask   and u2/ [ #signbit #exp_sign or u2/        ] Literal or     \ add 0.5 for better rounding
--         THEN  swap
-- ;
      WHEN op_INTEG => -- convert floating point number to 2s-complement mantissa/exponent pair
                       -- The floating point number is taken from TOS; afterwards NOS holds the
                       -- mantissa and TOS the exponent. Traps when with_float is false.
                       IF  with_float  THEN
                          push_stack;
                          -- exponent: fexp extended to data width with its own top bit
                          r_in.tos <= slice(fexp(fexp'high), data_width - exp_width) & fexp;
                          IF  r.tos(exp_width-1 DOWNTO 0) = 0  THEN  -- de-normalized or zero
                             r_in.nos <= r.tos;
                          ELSIF  r.tos(data_width-1) = '0'  THEN
                             -- non-negative: leading "01" is restored, exponent field replaced by rounding bits
                             r_in.nos <= "01" & r.tos(data_width-2 DOWNTO exp_width) & '1' & slice('0', exp_width-2);  -- add 0.5 for rounding
                          ELSE
                             -- negative: leading "10" is restored, exponent field replaced by rounding bits
                             r_in.nos <= "10" & r.tos(data_width-2 DOWNTO exp_width) & '0' & slice('1', exp_width-2);  -- add 0.5 for rounding
                          END IF;
                       ELSE
                          call_trap(i_usr);
                       END IF;

------------------------------------------------------------------------------
-- user trap instructions for otherwise unused opcodes
------------------------------------------------------------------------------

      -- Any opcode not decoded above traps when its upper three bits match those of
      -- op_USR; all remaining opcodes perform no action in this CASE statement.
      WHEN OTHERS   => IF  r.inst(7 DOWNTO 5) = op_USR(7 DOWNTO 5)  THEN
                          call_trap(i_usr);
                       END IF;
      END CASE;
   
      -- Saturating arithmetic: when enabled and the instruction above raised the
      -- overflow variable, the result in TOS is replaced by the most negative or the
      -- most positive value, selected by the sign bit of the old TOS. Being the last
      -- assignment to r_in.tos, it overrides the result computed in the CASE statement.
      -- NOTE(review): op_SUB / op_PSUB feed NOT r.tos into the adder, so for them the
      -- sign of r.tos is the inverse of the adder operand's sign - confirm that the
      -- saturation direction is as intended for these two instructions.
      IF  sat_arith AND overflow = '1'  THEN
          IF  r.tos(r.tos'high) = '1'  THEN
             r_in.tos <= '1' & slice('0', data_width-1);   -- most negative value
          ELSE
             r_in.tos <= '0' & slice('1', data_width-1);   -- most positive value
          END IF;
      END IF;
   
   END IF;

END PROCESS uCore_control;

END rtl;