From GMP programs to hardware

Some time ago I did a small experiment to automatically generate RTL from GMP (GNU multi-precision integer) programs (as inĀ gmplib.org). These programs use API calls to perform arbitrary-precision integer arithmetic for use in cryptography, conducting/supporting mathematical proofs and for purposes of scientific computing in general.

I have developed a concept implementation that is part of HercuLeS HLS (http://www.nkavvadias.com/hercules/). Essentially it is a set of additional features to HercuLeS. I actually used the fgmp (public domain) implementation and API, due to GMP’s licensing. The extensions (coded within a few days or so) did not yet generate synthesizable code, but I was able to generate simulation HDL (VHDL) with gmp API calls as proof-of-concept.

In order to support for the basic GMP (multi-precision integer) API/DSL for just addition and multiplication, the following additions were needed:

  • new VHDL package, mpint.vhd: MP integer and SLV (standard logic vector) operators: 500 lines of code (LOC)
  • C to N-Address Code frontend extensions: 50 LOC
  • Backend code generation in HercuLes: 150 LOC (in addition to what is already there)

This proof-of-concept frontend for GNU multi-precision integer programs has been implemented along with a VHDL IP library for key library functionalities such as mpz_set, mpz_add and mpz_mul.

A simple example

The following code implements factorial with “`fgmp“`, which is under the public domain:


MP_INT gmp_fact(unsigned int n)
{
  MP_INT prod;
  int i;
  mpz_init(&prod);
  mpz_set_si(&prod, 1);
  for (i = 2; i <= n; i++) {
    mpz_mul_ui(&prod, &prod, i);
  }
  return (prod);
}

This is the unoptimized FSMD in VHDL:


library IEEE;
use WORK.operpack.all;
use WORK.gmp_fact_cdt_pkg.all;
use STD.textio.all;
use WORK.std_logic_textio.all;
use WORK.mpint.all;
use WORK.std_logic_1164_tinyadditions.all;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

entity gmp_fact is
port (
  clk : in std_logic;
  reset : in std_logic;
  start : in std_logic;
  n : in std_logic_vector(31 downto 0);
  D_2747 : out std_logic_vector(31 downto 0);
  done : out std_logic;
  ready : out std_logic
);
end gmp_fact;

architecture fsmd of gmp_fact is
  type state_type is (S_ENTRY, S_EXIT, S_001_001, S_001_002, S_001_003, S_001_004, S_001_005, S_002_001, S_002_002, S_002_003, S_002_004, S_002_005, S_003_001, S_003_002, S_004_001, S_004_002);
  signal current_state, next_state: state_type;
  signal i_21_next : std_logic_vector(31 downto 0);
  signal i_21_reg : std_logic_vector(31 downto 0);
  signal m_11_next : std_logic_vector(31 downto 0);
  signal m_11_reg : std_logic_vector(31 downto 0);
  signal i_11_next : std_logic_vector(31 downto 0);
  signal i_11_reg : std_logic_vector(31 downto 0);
  signal prod_next : mp_int;
  signal prod_reg : mp_int;
  signal i_next : std_logic_vector(31 downto 0);
  signal i_reg : std_logic_vector(31 downto 0);
  signal prod_11_next : mp_int;
  signal prod_11_reg : mp_int;
  signal prod_21_next : mp_int;
  signal prod_21_reg : mp_int;
  signal prod_12_next : mp_int;
  signal prod_12_reg : mp_int;
  signal i_2_31_next : std_logic_vector(31 downto 0);
  signal i_2_31_reg : std_logic_vector(31 downto 0);
  signal i_1_21_next : std_logic_vector(31 downto 0);
  signal i_1_21_reg : std_logic_vector(31 downto 0);
  signal D_2747_next : std_logic_vector(31 downto 0);
  signal D_2747_reg : std_logic_vector(31 downto 0);
  constant CNST_0 : std_logic_vector(63 downto 0) := "0000000000000000000000000000000000000000000000000000000000000000";
  constant CNST_1 : std_logic_vector(63 downto 0) := "0000000000000000000000000000000000000000000000000000000000000001";
  constant CNST_2 : std_logic_vector(63 downto 0) := "0000000000000000000000000000000000000000000000000000000000000010";
begin
-- current state logic
process (clk, reset)
begin
  if (reset = '1') then
    current_state <= S_ENTRY;
    i_21_reg <= (others => '0');
    m_11_reg <= (others => '0');
    i_11_reg <= (others => '0');
    prod_reg.data <= (others => 0);
    i_reg <= (others => '0');
    prod_11_reg.data <= (others => 0);
    prod_21_reg.data <= (others => 0);
    prod_12_reg.data <= (others => 0);
    i_2_31_reg <= (others => '0');
    i_1_21_reg <= (others => '0');
    D_2747_reg <= (others => '0');
  elsif (clk = '1' and clk'EVENT) then
    current_state <= next_state;
    i_21_reg <= i_21_next;
    m_11_reg <= m_11_next;
    i_11_reg <= i_11_next;
    prod_reg <= prod_next;
    i_reg <= i_next;
    prod_11_reg <= prod_11_next;
    prod_21_reg <= prod_21_next;
    prod_12_reg <= prod_12_next;
    i_2_31_reg <= i_2_31_next;
    i_1_21_reg <= i_1_21_next;
    D_2747_reg <= D_2747_next;
  end if;
end process;

-- next state and output logic
process (current_state, start,
  n,
  D_2747_reg,
  i_21_reg, i_21_next,
  m_11_reg, m_11_next,
  i_11_reg, i_11_next,
  prod_reg, prod_next,
  i_reg, i_next,
  prod_11_reg, prod_11_next,
  prod_21_reg, prod_21_next,
  prod_12_reg, prod_12_next,
  i_2_31_reg, i_2_31_next,
  i_1_21_reg, i_1_21_next
)
begin
  done <= '0';
  ready <= '0';
  i_21_next <= i_21_reg;
  m_11_next <= m_11_reg;
  i_11_next <= i_11_reg;
  prod_next <= prod_reg;
  i_next <= i_reg;
  prod_11_next <= prod_11_reg;
  prod_21_next <= prod_21_reg;
  prod_12_next <= prod_12_reg;
  i_2_31_next <= i_2_31_reg;
  i_1_21_next <= i_1_21_reg;
  D_2747_next <= D_2747_reg; 
  case current_state is 
    when S_ENTRY =>
      ready <= '1';
      if (start = '1') then
        next_state <= S_001_001;
      else
        next_state <= S_ENTRY; 
      end if; 
    when S_001_001 =>
      m_11_next <= CNST_1(31 downto 0);
      next_state <= S_001_002; 
    when S_001_002 =>
      prod_12_next <= mpz_set_ui(m_11_reg);
      next_state <= S_001_003; 
    when S_001_003 =>
      i_11_next <= CNST_2(31 downto 0);
      prod_next <= prod_12_reg;
      next_state <= S_001_004; 
    when S_001_004 =>
      i_next <= i_11_reg(31 downto 0);
      next_state <= S_001_005; 
    when S_001_005 =>
      next_state <= S_003_001; 
    when S_002_001 =>
      i_1_21_next(31 downto 0) <= i_reg;
      next_state <= S_002_002; 
    when S_002_002 =>
      prod_21_next <= mpz_mul_ui(prod_reg, i_1_21_reg);
      next_state <= S_002_003; 
    when S_002_003 =>
      mpz_printh(prod_21_reg);
      next_state <= S_002_004; 
    when S_002_004 =>
      i_21_next <= std_logic_vector(signed(i_reg) + signed(CNST_1(31 downto 0)));
      next_state <= S_002_005; 
    when S_002_005 =>
      prod_next <= prod_21_reg;
      i_next <= i_21_reg(31 downto 0);
      next_state <= S_003_001; 
    when S_003_001 =>
      i_2_31_next(31 downto 0) <= i_reg;
      next_state <= S_003_002; 
    when S_003_002 =>
      if (i_2_31_reg <= n(31 downto 0)) then
        next_state <= S_002_001;
      else
        next_state <= S_004_001; end if; 
    when S_004_001 =>
      D_2747_next <= CNST_0(31 downto 0);
      next_state <= S_004_002; 
    when S_004_002 =>
      next_state <= S_EXIT; 
    when S_EXIT =>
      done <= '1';
      next_state <= S_ENTRY; 
    when others =>
      next_state <= S_ENTRY;
  end case;
end process;

D_2747 <= D_2747_reg;

end fsmd;