mandelph.c

/* -----------------------------------------------------------------------------
 *  Copyright (C) 2019-2021 daiteq s.r.o.                http://www.daiteq.com
 *
 *  This program is distributed WITHOUT ANY WARRANTY; without even
 *  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *  PURPOSE.
 *
 * -----------------------------------------------------------------------------
 *  Filename    : mandelph.c
 *  Authors     : Martin Danek
 *  Description : Mandelbrot set with packed half-precision ops
 *  Release     :
 *  Version     : 1.5
 *  Date        : 27.4.2021
 * -----------------------------------------------------------------------------
 */

#include <stdint.h>
#include <leon-myuart.h>
#include <my_printf.h>

#include <sfmath.h>
#include <sysregs.h>
#include <lconf.h>

/* for clearing BSS section */
extern unsigned long __bss_start;  /* start address for the .bss section. defined in linker script */
extern unsigned long _end;         /* end address for the .bss section. defined in linker script */

#define NOIOTIME

#if defined(PACKED)

  typedef half cpxh_t __attribute__((ext_vector_type(2)));

#else

  typedef struct cpxh_t {
    half x;
    half y;
  } cpxh_t;

#endif


#if defined(PACKED)

#define opcode_FADDRPH(A, B, RES)   \
  __asm__ __volatile__ (       \
     "faddrph\t%1, %2, %0 \n" \
      : "=f" (RES)                                                             \
      : "If" (A), "If" (B)                                                     \
  )


#define opcode_FMULXPH(A, B, RES)   \
  __asm__ __volatile__ (       \
     "fmulxph\t%1, %2, %0 \n" \
      : "=f" (RES)                                                             \
      : "If" (A), "If" (B)                                                     \
  )


  #define opcode_FSUBADDRPH(A, B, RES)   \
  __asm__ __volatile__ (       \
      "fsubaddrph\t%1, %2, %0 \n" \
      : "=f" (RES)          \
      : "If" (A), "If" (B)  \
  )

#define opcode_FSUBRPH(A, B, RES)   \
  __asm__ __volatile__ (       \
     "fsubrph\t%1, %2, %0 \n" \
      : "=f" (RES)                                                             \
      : "If" (A), "If" (B)                                                     \
  )


#define opcode_FMOVHZU(A, RES)   \
  __asm__ __volatile__ (       \
     "fmovhzu\t%1, %0 \n" \
      : "=f" (RES)                                                             \
      : "f" (A)                                                                \
  )

#define opcode_FMULRPH(A, B, RES)   \
  __asm__ __volatile__ (       \
     "fmulrph\t%1, %2, %0 \n" \
      : "=f" (RES)                                                             \
      : "If" (A), "If" (B)                                                     \
  )

#endif


#if defined(PACKED)

#define dcadd(a,b,z) \
  z=a+b;

#define dcsub(a,b,z) \
  z=a-b;


// 1. C = FMULP(A,B) = (ar*br, ai*bi)
// 2. D = FMULXP(A,B) = (ar*bi, ai*br)
// 3. H = FSUBADDRP(C,D) = (ar*br-ai*bi, ar*bi+ai*br)
#define dcmul(a,b,z) \
  do { \
  cpxh_t tc, td; \
  tc=a*b; \
  opcode_FMULXPH(a,b,td); \
  opcode_FSUBADDRPH(tc,td,z); \
  } while (0)

#define dcabss(a,z) \
  do { \
  cpxh_t th={0.h,0.h}; \
  cpxh_t tc, td; \
  tc=a*a; \
  opcode_FADDRPH(tc,th,td); \
  z=td.x; \
  } while (0)


#else

#define dcadd(a,b,z) \
    do { \
    z.x=a.x+b.x; \
    z.y=a.y+b.y; \
    } while (0)

#define dcsub(a,b,z) \
    do { \
    z.x=a.x-b.x; \
    z.y=a.y-b.y; \
    } while (0)


#define dcmul(a,b,z) \
    do { \
    cpxh_t th; \
    th.x=a.x*b.x-a.y*b.y; \
    th.y=a.y*b.x+a.x*b.y; \
    z=th; \
    } while (0)

#define dcabss(a,z) \
    do { \
    z=a.x*a.x+a.y*a.y; \
    } while (0)


#endif


#if defined(FCALL)

cpxh_t cadd(cpxh_t a, cpxh_t b) {
  cpxh_t c;
#if defined(PACKED)
  c=a+b;
#else
  c.x=a.x+b.x;
  c.y=a.y+b.y;
#endif
  return c;
}

cpxh_t csub(cpxh_t a, cpxh_t b) {
  cpxh_t c;
#if defined(PACKED)
  c=a-b;
#else
  c.x=a.x-b.x;
  c.y=a.y-b.y;
#endif
  return c;
}


cpxh_t cmul(cpxh_t a, cpxh_t b) {
// 1. C = FMULP(A,B) = (ar*br, ai*bi)
// 2. D = FMULXP(A,B) = (ar*bi, ai*br)
// 3. E = FADDRP(D,0) = (ar*bi+ai*br, 0)
// 4. F = FSUBRP(C,0) = (ar*br-ai*bi, 0)
// 5. G = FMOVHZU(E) = (0, ar*bi+ai*br)
// 6. H = FADDP(F,G) = (ar*br-ai*bi, ar*bi+ai*br)

  cpxh_t x,y,c,d,e,f,g,h;
  const cpxh_t zero={0.H,0.H};
#if defined(PACKED)
  x=a;
  y=b;
  c=x*y;
  opcode_FMULXPH(x,y,d);
  opcode_FSUBADDRPH(c,d,h);
#else
  h.x=a.x*b.x-a.y*b.y;
  h.y=a.y*b.x+a.x*b.y;
#endif
  return h;
}

half cabss(cpxh_t a) {
  cpxh_t x,c,d;
  const cpxh_t zero={0.h,0.h};
#if defined(PACKED)
  c=a*a;
  opcode_FADDRPH(c,zero,d);
#else
  d.x=a.x*a.x+a.y*a.y;
#endif
  return d.x;
}
#endif


#define RES_X 80
#define RES_Y 80
#define MAX_ITER 100

#define MAX_COL 80
#define MAX_ROW 80

#define printpacked(A) \
  c=A.x; \
  d=A.y; \
  printf(" (%f,%f)",c,d);


int main(void) {

    half a,b,c,d;
    cpxh_t pa,pb,pc,point,next;

    int i, j;
    unsigned k,l,m,hit;

  /* reset BSS section to zero */
  for(unsigned long *pMem = &__bss_start; pMem < &_end;) {
    *(pMem++) = 0;
  }

  init_uart();

  init_softfloat();

#ifndef NOHWCONF
  do {
    lconf_type lconf;
    lconf_fpu_type lconf_fpu;
    lconf_swar_type lconf_swar;
    get_lconf(&lconf);
    get_lconf_fpu(&lconf_fpu);
    get_lconf_swar(&lconf_swar);
    report_lconf(&lconf);
    if ((lconf.fpu==FPU_DAIFPU)||(lconf.fpu==FPU_MEIKO))
      report_lconf_fpu(&lconf_fpu);
    report_lconf_swar(&lconf_swar);
    sysregs_init();
  } while(0);
#endif /* NOHWCONF */

#if defined (PACKED)
    m_print("MANDEL - HALF PRECISION, PACKED\n");
#else
    m_print("MANDEL - HALF PRECISION, NORMAL\n");
#endif

// #define INIT_TEST
#if defined (INIT_TEST)
    a = 3.0h;
    b = 4.0h;

    pa.x = a;
    pa.y = b;
    printf("A=(%f,%f)\n",pa.x,pa.y);

    pb.x = 0.h;
    pb.y = 1.h;
    printf("B=(%f,%f)\n",pb.x,pb.y);

#if defined(FCALL)
    pc=cadd(pa,pb);
#else
    dcadd(pa,pb,pc);
#endif
    m_print("pa+pb "); printpacked(pc); NL;
#if defined(FCALL)
    pc=csub(pa,pb);
#else
    dcsub(pa,pb,pc);
#endif
    m_print("pa-pb "); printpacked(pc); NL;
#if defined(FCALL)
    pc=cmul(pa,pb);
#else
    dcmul(pa,pb,pc);
#endif
    m_print("pa*pb "); printpacked(pc); NL;
#endif

#ifdef NOIOTIME
    sysregs_init();
#endif

    for (j=-RES_Y;j<RES_Y;j++) {
      for (i=-2*RES_X;i<RES_X;i++) {
        point.x=((half)i)/(((half)RES_X)*1.h);point.y=((half)j)/(((half)RES_X)*1.h);
        k=0;
        next.x=0.h;
        next.y=0.h;
#if defined(FCALL)
        while ((k<MAX_ITER)&&(cabss(next)<64000.h)) {
#else
        dcabss(next,a);
        while ((k<MAX_ITER)&&(a<64000.h)) {
#endif
#if defined(FCALL)
          next=cadd(cmul(next,next),point);
#else
          dcmul(next,next,pa);
          dcadd(pa,point,next);
          dcabss(next,a);
#endif
          k++;
        }
        if (k==MAX_ITER) {
#ifdef NOIOTIME
          sysregs_stop();
#endif
          m_putchar('*');
#ifdef NOIOTIME
          sysregs_start();
#endif
        }
        else {
          if (k<10) {
#ifdef NOIOTIME
            sysregs_stop();
#endif
            m_putchar('0'+k);
#ifdef NOIOTIME
            sysregs_start();
#endif
          }
          else {
#ifdef NOIOTIME
            sysregs_stop();
#endif
            m_putchar('A'+((k>35)?35:k)-10);
#ifdef NOIOTIME
            sysregs_start();
#endif
          }
        }
      }
      NL;
    }

    NL;


#ifndef NOHWCONF
    do {
      unsigned ticks_lo, ticks_hi, insns_lo, insns_hi;
      sysregs_stop();
      sysregs_read(&ticks_lo, &ticks_hi, &insns_lo, &insns_hi);
      if (ticks_hi)
        printf("TEST EXECUTED IN (%u * 2^32 + %u) TICKS AND (%u * 2^32 + %u) INSTRUCTIONS\n\n", ticks_hi, ticks_lo, insns_hi, insns_lo);
      else
        printf("TEST EXECUTED IN %u TICKS AND %u INSTRUCTIONS\n\n", ticks_lo, insns_lo);
    } while(0);
#endif /* NOHWCONF */


    m_print("\n\nEND OF TEST\n\n");

    __asm__("ta 1; nop; nop;");

    return 0;
}