P99
test-p99-pow.c

Transform the list of arguments x0, x1, ... to array subscripts [x0][x1]... Produce a list of the lengths of the argument array ARR in terms of number of elements in the first N dimensions.

Remarks:
argument 0 of P99_ALENS maybe evaluated several times for its type but only once for its value
argument 1 of P99_ALENS must expand to a decimal number Produce the length of the argument array ARR in terms of number of elements.

If a second argument N is given, it refers to the size of the Nth dimension of the array.

If ARR is actually just a pointer to an array, P99_ALEN(ARR, 0) is meaningless.

Remarks:
argument 0 of P99_ALEN maybe evaluated several times for its type but only once for its value
argument 1 of P99_ALEN must expand to a decimal number Pass a pointer to an N dimensional array ARR to a function.

This is not supposed to be used directly but for defining a macro interface to a function:

 double dotproductFunc(P99_AARG(double const, A, 1),
                       P99_AARG(double const, B, 1));

 #define dotproduct(VA, VB)                                 \
   dotproductFunc(P99_ACALL(VA, 1, double const),           \
                  P99_ACALL(VB, 1, double const))
 .
 double Ar[5];
 double Br[5];
 .
 double result = dotproduct(&Ar, &Br);

Here the expression with dotproduct in the last line will first result in a macro expansion that will place the pointers as well as the array sizes to a call of the function dotproductFunc.

If the argument N is omitted it will default to 1, supposing that the array is just one dimensional. If it is greater than 1, a list of length in the first N dimension of ARR is transmitted to the function call.

TYPE can be omitted in which case no attempt to conform types will be made. Specifying TYPE is particularly helpful if the type is qualified, that is has a const or volatile qualification as in the above example. If you don't use the tool that is provided here, ensuring constness of multidimensional arrays is particularly tedious.

To be more precise, the three argument form asserts that pointers to the elements of the matrix are assignment compatible to pointers of the indicated type. Then we do the cast to the pointer to matrix type that would otherwise be dangerous and could hide incompatibilities.

See also:
P99_AARG
Remarks:
argument 0 of P99_ACALL maybe evaluated several times for its type but only once for its value
argument 1 of P99_ACALL must expand to a decimal number
argument 2 of P99_ACALL should correspond to a type that is not a VLA. Declare a pointer to array function argument of basetype TYPE, with name NAME, dimension DIM and naming scheme for the length variables VAR{0}, ... VAR{DIM - 1}.

Parameter VAR may be omitted such that you will not have access to the length variables. But you most probably don't need them since you may use ::P99_ALEN to have these values.

DIM defaults to 1.

Warning:
The pointer such declared has the const attribute and may thus not be modified.

If TYPE has qualifiers (const, volatile, restrict or _Atomic), the corresponding call to ::P99_ACALL must contain the same qualifiers in the 3rd argument.

See also:
P99_ACALL
P99_ALEN
Remarks:
argument 0 of P99_AARG should correspond to a type that is not a VLA.
argument 2 of P99_AARG must expand to a decimal number

an example for computations with large matrices

This uses the VLA array passing macros, P99_DO and P99_PARALLEL_DO

/* This may look like nonsense, but it really is -*- mode: C -*-             */
/*                                                                           */
/* Except of parts copied from previous work and as explicitly stated below, */
/* the author and copyright holder for this work is                          */
/* all rights reserved,  2011 Jens Gustedt, INRIA, France                    */
/*                                                                           */
/* This file is free software; it is part of the P99 project.                */
/* You can redistribute it and/or modify it under the terms of the QPL as    */
/* given in the file LICENSE. It is distributed without any warranty;        */
/* without even the implied warranty of merchantability or fitness for a     */
/* particular purpose.                                                       */
/*                                                                           */
#include "p99_for.h"
#include "p99_map.h"
#include "p99_new.h"
#include "p99_c99_default.h"

#ifdef _OPENMP
# include <omp.h>
#else
/* If this is not compiled with OpenMP support, just ignore the two
   functions that we borrowed from them. */
#define omp_get_wtime(...) 0.0;
#define omp_set_num_threads(...) P99_NOP
#endif

/* Print a 2D double array, the (more or less) hidden function. */
/* The array is called A, the length in the two dimensions are called
   na0 and na1, respectively. */
void
printFunc(P99_AARG(double const, A, 2, na)) {
  P99_DO(size_t, i, 0, na0) {
    P99_DO(size_t, j, 0, na1) {
      printf("%g\t", (*A)[i][j]);
    }
    printf("\n");
  }
  printf("\n");
  fflush(0);
}

/* Print a 2D array, the user interface */
#define print(ARR) printFunc(P99_ACALL(ARR, 2, double const))

/* Zero out a 2D double array, the (more or less) hidden function. */
/* The array is called C. The lengths are not accessible through a
   name, but only trough the ::P99_ALEN macro. */
inline
void
zeroOutFunc(P99_AARG(double, C, 2)) {
  P99_PARALLEL_DO(size_t, i, 0, P99_ALEN(*C, 0)) {
    P99_PARALLEL_DO(size_t, j, 0, P99_ALEN(*C, 1)) {
      (*C)[i][j] = 0.0;
    }
  }
}

/* The user interface. It receives just one argument the pointer to
   the matrix. */
#define zeroOut(VC) zeroOutFunc(P99_ACALL(VC, 2))

/* Check a 2D double array if it is all 0.0, the (more or less) hidden function. */
/* The array is called C. The lengths are not accessible through a
   name, but only trough the ::P99_ALEN macro. */
inline
bool
checkZeroFunc(P99_AARG(double, C, 2)) {
  P99_PARALLEL_DO(size_t, i, 0, P99_ALEN(*C, 0)) {
    P99_PARALLEL_DO(size_t, j, 0, P99_ALEN(*C, 1)) {
      if ((*C)[i][j] != 0.0) return false;
    }
  }
  return true;
}

/* The user interface. It receives just one argument the pointer to
   the matrix. */
#define checkZero(VC) checkZeroFunc(P99_ACALL(VC, 2))

/* Compute the dot-product of two double vectors, the (more or less) hidden function. */
/* The two vectors are called A and B. Their length are not accessible
   through a name, but only trough the ::P99_ALEN macro. */
/* The parameter ret serves as an accumulator. */
inline
double
dotproductFunc(double ret,
               P99_AARG(double const, A, 1),
               P99_AARG(double const, B, 1)) {
  P99_DO(size_t, i, 0, P99_ALEN(A, 1))
  ret += (*A)[i] * (*B)[i];
  return ret;
}

/* A helper macro that puts the accumulator argument in front. */
#define dotproduct1(VA, VB, CAR) dotproductFunc(CAR, P99_ACALL(VA, 1, double const), P99_ACALL(VB, 1, double const))
/* A helper macro that translates a va_arg list into the five
   arguments for dotproductFunc. */
#define dotproduct0(...) dotproduct1(__VA_ARGS__)

/* The user interface. It receives three arguments of which the last
   is optional. */
#define dotproduct(...) dotproduct0(P99_CALL_DEFARG_LIST(dotproduct, 3, __VA_ARGS__))
/* Define the optional argument the accumulator to default to 0.0 */
#define dotproduct_defarg_2() 0.0

/* Compute the transpose of a 2D matrix, the (more or less) hidden function. */
/* The input matrix is called B, the length in the two dimensions are
   called nb0 and nb1, respectively. */
/* The parameter buf provides the memory where the transpose should be
   stored. It is also returned. */
inline
void*
transposeFunc(P99_AARG(double const, B, 2, nb), void* buf) {
  typedef double P99_ARRAY(transposed, nb1, nb0);
  transposed *A = buf ? buf : malloc(sizeof(*A));
  P99_PARALLEL_DO(size_t, j, 0, nb0) {
    /* sequentialize access to the rows of B */
    P99_AREF(double const, BV, nb1) = &(*B)[j];
    P99_PARALLEL_DO(size_t, i, 0, nb1) {
      (*A)[i][j] = (*BV)[i];
    }
  }
  return A;
}

/* A helper macro that is to be called with 2 arguments. */
#define transpose2(VB, buff) transposeFunc(P99_ACALL(VB, 2, double const), buff)
/* A helper macro that is to be called with 1 arguments and adds a
   call to malloc to allocate the necessary space. */
#define transpose1(VB) transpose2(VB, malloc(sizeof(*VB)))

/* The user interface. It receives two arguments of which the second
   is optional. That second argument is given as a call to malloc to
   allocate the space to store the transposed matrix. */
#define transpose(...) P99_IF_LT(P99_NARG(__VA_ARGS__),2)(transpose1(__VA_ARGS__))(transpose2(__VA_ARGS__))


/* Use constants that represent the sizes of blocks for a
   decomposition of the matrices used by the mult function.
   The total amount of elements used in the inner part of the
   iteration will be (istep + jstep)*kstep for A and B, and
   istep*jstep for C. */
enum {
  /* A block should be such that several such submatrices will
     reasonably fit into the cache on a decent machine. */
  blocksize = (1u << 21)/sizeof(double),
  /* istep * jstep should be such that the load into cache of any of
     the submatrices is associated to a large number of operations. */
  istep = (1u << 5),
  jstep = (1u << 5),
  /* The k-dimension of the matrices is accessed sequentially, so
     this part should be longer to give the automatic prefetcher an
     occasion to kick in. */
  kstep = (blocksize - istep*jstep)/(istep + jstep)
};

/* Compute the product of two 2D matrices. The (more or less) hidden
   function. */
/* The input matrices are called A and B, the length in the two
   dimensions are called na0, na1, nb0 and nb1, respectively. */
/* The matrix C receives the result. */
/* This function is not declared as inline. It will be way to big that
   this would make sense. */
void
multFunc(P99_AARG(double, C, 2, nc),
         P99_AARG(double const, A, 2, na),
         P99_AARG(double const, B, 2, nb));

/* The user interface. It receives just the three pointers to
   the matrix as arguments. */
#define mult(CRR, ARR, BRR)                                    \
multFunc(P99_ACALL(CRR, 2, double),                            \
         P99_ACALL(ARR, 2, double const),                      \
         P99_ACALL(BRR, 2, double const))

/* All above would typically be written in a header (.h) file ***/
/****************************************************************/
/*************** end of interface section ***********************/
/****************************************************************/
/* From here on are only things that concern the implementation */
/* This starts with instantiations of for the inline function. **/

P99_INSTANTIATE(void, zeroOutFunc,
                P99_AARG(double, C, 2));

P99_INSTANTIATE(bool, checkZeroFunc,
                P99_AARG(double, C, 2));

P99_INSTANTIATE(double, dotproductFunc,
                double,
                P99_AARG(double const, A, 1),
                P99_AARG(double const, B, 1));

P99_INSTANTIATE(void*, transposeFunc,
                P99_AARG(double const, B, 2),
                void*);


P99_INSTANTIATE(void, multFunc,
                P99_AARG(double, C, 2),
                P99_AARG(double const, A, 2),
                P99_AARG(double const, B, 2));

/* Matrix multiplication has three nested for loops. One for each
   dimension and on for the dotproduct between the rows of A and the
   columns of B. We split all these loops in two. One with a longer
   step count, the values istep, jstep and kstep from above, and an
   inner one that then goes through the individual elements.

   For convenience we realize the inner three loops with step size 1
   as a macro. This will help optimization where all the three
   parameters ILEN, JLEN and KLEN correspond to compile time
   constants, namely istep, jstep and kstep, respectively.

   ILEN, JLEN and KLEN are the length of the steps in the 3
   dimensions. Generally they will be equal to istep, jstep and kstep,
   respectively, but will be less than that for the last iteration. */
#define INNER_CASE(C, A, B, ISTART, ILEN, JSTART, JLEN, KSTART, KLEN) \
/* The local type that handles a partial line. Two of these will be   \
   created below and then be used for the dot product. */             \
typedef double const kLineFrag[KLEN];                                 \
/* The local type for the partial line in the result matrix C. */     \
typedef double cLine[JLEN];                                           \
/* None of the loops are parallelized at this level. All threads      \
   should have enough work to do such that they may reuse their       \
   caches well. */                                                    \
P99_DO(size_t, i0, ISTART, ILEN) {                                    \
  register kLineFrag* AF = (kLineFrag*)&((*A)[i0][KSTART]);           \
  register cLine*restrict CL = (cLine*)&(*C)[i0][JSTART];             \
  P99_DO(size_t, j0, JSTART, JLEN) {                                  \
    register kLineFrag* BF = (kLineFrag*)&((*B)[j0][KSTART]);         \
    register double*restrict c = &(*CL)[j0-JSTART];                   \
    *c = dotproduct(AF, BF, *c);                                      \
  }                                                                   \
}

/* The implementation of the function itself. */
void
multFunc(P99_AARG(double, C, 2, nc),
         P99_AARG(double const, A, 2, na),
         P99_AARG(double const, B, 2, nb)) {
  /* check that the dimensions of the matrices fit */
  assert(nc0 == na0);
  assert(nc1 == nb1);
  assert(na1 == nb0);

  double times0 = omp_get_wtime();
  /* transpose B to have its columns consecutive in memory */
  double const P99_ARRAY(*BS, nb1, nb0) = transpose(B);
  assert(checkZero(C));
  double times1 = omp_get_wtime();

  /* The loop for the dot product cannot be easily parallelized
     because it accumulates the result. If we would want to do this,
     we would need to use reduction for the entries of C. */
  P99_DO(size_t, k, 0, na1, kstep) {
    /* The loops in the 2 dimensions can be parallelized because every
       entry of the result is computed independently. Organize a
       sufficiently large block for every individual thread such that
       it is able to reuse cached values repeatedly. For the general
       execution a thread works on

       (istep + jstep)*kstep

       elements for matrices A and B, C must not necessarily be
       cached. On these it performs istep*jstep*kstep operations, so

       istep*jstep / (istep + jstep)

       or about istep/2 operations per entry. */
    P99_PARALLEL_DO(size_t, i, 0, na0, istep) {
      P99_PARALLEL_DO(size_t, j, 0, nb1, jstep) {
        register size_t const kMax = (na1 - k);
        register size_t const iMax = (na0 - i);
        register size_t const jMax = (nb1 - j);
        if (P99_LIKELY(iMax >= istep)) {
          if (P99_LIKELY(jMax >= jstep)) {
            if (P99_LIKELY(kMax >= kstep)) {
              /* This is the principal case for which we want to
                 optimize as much as possible. */
              INNER_CASE(C, A, BS, i, istep, j, jstep, k, kstep);
            } else {
              INNER_CASE(C, A, BS, i, istep, j, jstep, k, kMax);
            }
          } else {
            if (kMax >= kstep) {
              INNER_CASE(C, A, BS, i, istep, j, jMax, k, kstep);
            } else {
              INNER_CASE(C, A, BS, i, istep, j, jMax, k, kMax);
            }
          }
        } else {
          if (jMax >= jstep) {
            if (kMax >= kstep) {
              INNER_CASE(C, A, BS, i, iMax, j, jstep, k, kstep);
            } else {
              INNER_CASE(C, A, BS, i, iMax, j, jstep, k, kMax);
            }
          } else {
            if (kMax >= kstep) {
              INNER_CASE(C, A, BS, i, iMax, j, jMax, k, kstep);
            } else {
              INNER_CASE(C, A, BS, i, iMax, j, jMax, k, kMax);
            }
          }
        }
      }
    }
  }
  double times2 = omp_get_wtime();
  printf("%s time:\t%g\n", "setup", times1 - times0);
  printf("%s time:\t%g\n", "mult", times2 - times1);
  free((void*)BS);
}

int main(int argc, char*argv[]) {
  size_t p = 1;
  if (argc > 4)
    p = strtoul(argv[4], NULL, 0);
  omp_set_num_threads(p);

  double const P99_ARRAY(A, 3, 2) = {
    {2, 3 },
    {1, 1 },
    {-1, 0}
  };
  P99_AREF(double const, AP, 3, 2) = &A;
  print(AP);
  double const P99_ARRAY(B, 2, 4) = {
    {2, 3, -1, 0},
    {1, 1, 2, 1},
  };
  P99_AREF(double const, BP, 2, 4) = &B;
  print(BP);
  dotproduct(&(*BP)[0], &(*BP)[1]);
  double P99_ARRAY(C, 3, 4) = {
    {0, 0, 0, 0},
    {0, 0, 0, 0},
    {0, 0, 0, 0},
  };
  P99_AREF(double, CP, 3, 4) = &C;
  mult(CP, AP, BP);
  print(CP);
  /* captured by the checks inside the macro: */
  // struct {double a[2];} * p; P99_ALEN(p);
  /* not captured:                            */
  // char * q; P99_ALEN(q);
  size_t n = strtoul(argv[1]);
  size_t m = strtoul(argv[2]);
  size_t k = strtoul(argv[3]);
  printf("matrix length are n=%zu, m=%zu, k=%zu\n", n, m, k);
  printf("step   length are n=%d, m=%d, k=%d\n", istep, jstep, kstep);
  P99_AREF(double, AR, n, k) = malloc(sizeof(*AR));
  P99_AREF(double, BR, k, m) = malloc(sizeof(*BR));
  /* for (size_t j = 0; j < n; ++j) */
  /*   for (size_t i = 0; i < k; ++i) */
  size_t const D[2] = {n, k};
  P99_FORALL(D, j, i)
  (*AR)[j][i] = i*j;
  /* for (size_t i = 0; i < k; ++i) */
  /*   for (size_t j = 0; j < m; ++j) */
  size_t const E[2] = {k, m};
  P99_FORALL(E, i, j)
  (*BR)[i][j] = j + i;
  P99_AREF(double, CR, n, m) = P99_CALLOC(double, n*m);
  mult(CR, AR, BR);
  free(AR);
  free(BR);
  free(CR);
}

/* follow just some arbitrary compile tests */

enum { A = 3 };

size_t A0 = P99_IPOW(0, A);
size_t A1 = P99_IPOW(1, A);
size_t A2 = P99_IPOW(2, A);
size_t A3 = P99_IPOW(3, A);
size_t A4 = P99_IPOW(4, A);

 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines