|
P99
|
Transform the list of arguments x0, x1, ... to array subscripts [x0][x1]... Produce a list of the lengths of the argument array ARR in terms of number of elements in the first N dimensions.
If a second argument N is given, it refers to the size of the Nth dimension of the array.
If ARR is actually just a pointer to an array, P99_ALEN(ARR, 0) is meaningless.
This is not supposed to be used directly but for defining a macro interface to a function:
double dotproductFunc(P99_AARG(double const, A, 1), P99_AARG(double const, B, 1)); #define dotproduct(VA, VB) \ dotproductFunc(P99_ACALL(VA, 1, double const), \ P99_ACALL(VB, 1, double const)) . double Ar[5]; double Br[5]; . double result = dotproduct(&Ar, &Br);
Here the expression with dotproduct in the last line will first result in a macro expansion that will place the pointers as well as the array sizes to a call of the function dotproductFunc.
If the argument N is omitted it will default to 1, supposing that the array is just one dimensional. If it is greater than 1, a list of length in the first N dimension of ARR is transmitted to the function call.
TYPE can be omitted in which case no attempt to conform types will be made. Specifying TYPE is particularly helpful if the type is qualified, that is has a const or volatile qualification as in the above example. If you don't use the tool that is provided here, ensuring constness of multidimensional arrays is particularly tedious.
To be more precise, the three argument form asserts that pointers to the elements of the matrix are assignment compatible to pointers of the indicated type. Then we do the cast to the pointer to matrix type that would otherwise be dangerous and could hide incompatibilities.
Parameter VAR may be omitted such that you will not have access to the length variables. But you most probably don't need them since you may use ::P99_ALEN to have these values.
DIM defaults to 1.
const attribute and may thus not be modified.If TYPE has qualifiers (const, volatile, restrict or _Atomic), the corresponding call to ::P99_ACALL must contain the same qualifiers in the 3rd argument.
an example for computations with large matrices
This uses the VLA array passing macros, P99_DO and P99_PARALLEL_DO
/* This may look like nonsense, but it really is -*- mode: C -*- */ /* */ /* Except of parts copied from previous work and as explicitly stated below, */ /* the author and copyright holder for this work is */ /* all rights reserved, 2011 Jens Gustedt, INRIA, France */ /* */ /* This file is free software; it is part of the P99 project. */ /* You can redistribute it and/or modify it under the terms of the QPL as */ /* given in the file LICENSE. It is distributed without any warranty; */ /* without even the implied warranty of merchantability or fitness for a */ /* particular purpose. */ /* */ #include "p99_for.h" #include "p99_map.h" #include "p99_new.h" #include "p99_c99_default.h" #ifdef _OPENMP # include <omp.h> #else /* If this is not compiled with OpenMP support, just ignore the two functions that we borrowed from them. */ #define omp_get_wtime(...) 0.0; #define omp_set_num_threads(...) P99_NOP #endif /* Print a 2D double array, the (more or less) hidden function. */ /* The array is called A, the length in the two dimensions are called na0 and na1, respectively. */ void printFunc(P99_AARG(double const, A, 2, na)) { P99_DO(size_t, i, 0, na0) { P99_DO(size_t, j, 0, na1) { printf("%g\t", (*A)[i][j]); } printf("\n"); } printf("\n"); fflush(0); } /* Print a 2D array, the user interface */ #define print(ARR) printFunc(P99_ACALL(ARR, 2, double const)) /* Zero out a 2D double array, the (more or less) hidden function. */ /* The array is called C. The lengths are not accessible through a name, but only trough the ::P99_ALEN macro. */ inline void zeroOutFunc(P99_AARG(double, C, 2)) { P99_PARALLEL_DO(size_t, i, 0, P99_ALEN(*C, 0)) { P99_PARALLEL_DO(size_t, j, 0, P99_ALEN(*C, 1)) { (*C)[i][j] = 0.0; } } } /* The user interface. It receives just one argument the pointer to the matrix. */ #define zeroOut(VC) zeroOutFunc(P99_ACALL(VC, 2)) /* Check a 2D double array if it is all 0.0, the (more or less) hidden function. */ /* The array is called C. The lengths are not accessible through a name, but only trough the ::P99_ALEN macro. */ inline bool checkZeroFunc(P99_AARG(double, C, 2)) { P99_PARALLEL_DO(size_t, i, 0, P99_ALEN(*C, 0)) { P99_PARALLEL_DO(size_t, j, 0, P99_ALEN(*C, 1)) { if ((*C)[i][j] != 0.0) return false; } } return true; } /* The user interface. It receives just one argument the pointer to the matrix. */ #define checkZero(VC) checkZeroFunc(P99_ACALL(VC, 2)) /* Compute the dot-product of two double vectors, the (more or less) hidden function. */ /* The two vectors are called A and B. Their length are not accessible through a name, but only trough the ::P99_ALEN macro. */ /* The parameter ret serves as an accumulator. */ inline double dotproductFunc(double ret, P99_AARG(double const, A, 1), P99_AARG(double const, B, 1)) { P99_DO(size_t, i, 0, P99_ALEN(A, 1)) ret += (*A)[i] * (*B)[i]; return ret; } /* A helper macro that puts the accumulator argument in front. */ #define dotproduct1(VA, VB, CAR) dotproductFunc(CAR, P99_ACALL(VA, 1, double const), P99_ACALL(VB, 1, double const)) /* A helper macro that translates a va_arg list into the five arguments for dotproductFunc. */ #define dotproduct0(...) dotproduct1(__VA_ARGS__) /* The user interface. It receives three arguments of which the last is optional. */ #define dotproduct(...) dotproduct0(P99_CALL_DEFARG_LIST(dotproduct, 3, __VA_ARGS__)) /* Define the optional argument the accumulator to default to 0.0 */ #define dotproduct_defarg_2() 0.0 /* Compute the transpose of a 2D matrix, the (more or less) hidden function. */ /* The input matrix is called B, the length in the two dimensions are called nb0 and nb1, respectively. */ /* The parameter buf provides the memory where the transpose should be stored. It is also returned. */ inline void* transposeFunc(P99_AARG(double const, B, 2, nb), void* buf) { typedef double P99_ARRAY(transposed, nb1, nb0); transposed *A = buf ? buf : malloc(sizeof(*A)); P99_PARALLEL_DO(size_t, j, 0, nb0) { /* sequentialize access to the rows of B */ P99_AREF(double const, BV, nb1) = &(*B)[j]; P99_PARALLEL_DO(size_t, i, 0, nb1) { (*A)[i][j] = (*BV)[i]; } } return A; } /* A helper macro that is to be called with 2 arguments. */ #define transpose2(VB, buff) transposeFunc(P99_ACALL(VB, 2, double const), buff) /* A helper macro that is to be called with 1 arguments and adds a call to malloc to allocate the necessary space. */ #define transpose1(VB) transpose2(VB, malloc(sizeof(*VB))) /* The user interface. It receives two arguments of which the second is optional. That second argument is given as a call to malloc to allocate the space to store the transposed matrix. */ #define transpose(...) P99_IF_LT(P99_NARG(__VA_ARGS__),2)(transpose1(__VA_ARGS__))(transpose2(__VA_ARGS__)) /* Use constants that represent the sizes of blocks for a decomposition of the matrices used by the mult function. The total amount of elements used in the inner part of the iteration will be (istep + jstep)*kstep for A and B, and istep*jstep for C. */ enum { /* A block should be such that several such submatrices will reasonably fit into the cache on a decent machine. */ blocksize = (1u << 21)/sizeof(double), /* istep * jstep should be such that the load into cache of any of the submatrices is associated to a large number of operations. */ istep = (1u << 5), jstep = (1u << 5), /* The k-dimension of the matrices is accessed sequentially, so this part should be longer to give the automatic prefetcher an occasion to kick in. */ kstep = (blocksize - istep*jstep)/(istep + jstep) }; /* Compute the product of two 2D matrices. The (more or less) hidden function. */ /* The input matrices are called A and B, the length in the two dimensions are called na0, na1, nb0 and nb1, respectively. */ /* The matrix C receives the result. */ /* This function is not declared as inline. It will be way to big that this would make sense. */ void multFunc(P99_AARG(double, C, 2, nc), P99_AARG(double const, A, 2, na), P99_AARG(double const, B, 2, nb)); /* The user interface. It receives just the three pointers to the matrix as arguments. */ #define mult(CRR, ARR, BRR) \ multFunc(P99_ACALL(CRR, 2, double), \ P99_ACALL(ARR, 2, double const), \ P99_ACALL(BRR, 2, double const)) /* All above would typically be written in a header (.h) file ***/ /****************************************************************/ /*************** end of interface section ***********************/ /****************************************************************/ /* From here on are only things that concern the implementation */ /* This starts with instantiations of for the inline function. **/ P99_INSTANTIATE(void, zeroOutFunc, P99_AARG(double, C, 2)); P99_INSTANTIATE(bool, checkZeroFunc, P99_AARG(double, C, 2)); P99_INSTANTIATE(double, dotproductFunc, double, P99_AARG(double const, A, 1), P99_AARG(double const, B, 1)); P99_INSTANTIATE(void*, transposeFunc, P99_AARG(double const, B, 2), void*); P99_INSTANTIATE(void, multFunc, P99_AARG(double, C, 2), P99_AARG(double const, A, 2), P99_AARG(double const, B, 2)); /* Matrix multiplication has three nested for loops. One for each dimension and on for the dotproduct between the rows of A and the columns of B. We split all these loops in two. One with a longer step count, the values istep, jstep and kstep from above, and an inner one that then goes through the individual elements. For convenience we realize the inner three loops with step size 1 as a macro. This will help optimization where all the three parameters ILEN, JLEN and KLEN correspond to compile time constants, namely istep, jstep and kstep, respectively. ILEN, JLEN and KLEN are the length of the steps in the 3 dimensions. Generally they will be equal to istep, jstep and kstep, respectively, but will be less than that for the last iteration. */ #define INNER_CASE(C, A, B, ISTART, ILEN, JSTART, JLEN, KSTART, KLEN) \ /* The local type that handles a partial line. Two of these will be \ created below and then be used for the dot product. */ \ typedef double const kLineFrag[KLEN]; \ /* The local type for the partial line in the result matrix C. */ \ typedef double cLine[JLEN]; \ /* None of the loops are parallelized at this level. All threads \ should have enough work to do such that they may reuse their \ caches well. */ \ P99_DO(size_t, i0, ISTART, ILEN) { \ register kLineFrag* AF = (kLineFrag*)&((*A)[i0][KSTART]); \ register cLine*restrict CL = (cLine*)&(*C)[i0][JSTART]; \ P99_DO(size_t, j0, JSTART, JLEN) { \ register kLineFrag* BF = (kLineFrag*)&((*B)[j0][KSTART]); \ register double*restrict c = &(*CL)[j0-JSTART]; \ *c = dotproduct(AF, BF, *c); \ } \ } /* The implementation of the function itself. */ void multFunc(P99_AARG(double, C, 2, nc), P99_AARG(double const, A, 2, na), P99_AARG(double const, B, 2, nb)) { /* check that the dimensions of the matrices fit */ assert(nc0 == na0); assert(nc1 == nb1); assert(na1 == nb0); double times0 = omp_get_wtime(); /* transpose B to have its columns consecutive in memory */ double const P99_ARRAY(*BS, nb1, nb0) = transpose(B); assert(checkZero(C)); double times1 = omp_get_wtime(); /* The loop for the dot product cannot be easily parallelized because it accumulates the result. If we would want to do this, we would need to use reduction for the entries of C. */ P99_DO(size_t, k, 0, na1, kstep) { /* The loops in the 2 dimensions can be parallelized because every entry of the result is computed independently. Organize a sufficiently large block for every individual thread such that it is able to reuse cached values repeatedly. For the general execution a thread works on (istep + jstep)*kstep elements for matrices A and B, C must not necessarily be cached. On these it performs istep*jstep*kstep operations, so istep*jstep / (istep + jstep) or about istep/2 operations per entry. */ P99_PARALLEL_DO(size_t, i, 0, na0, istep) { P99_PARALLEL_DO(size_t, j, 0, nb1, jstep) { register size_t const kMax = (na1 - k); register size_t const iMax = (na0 - i); register size_t const jMax = (nb1 - j); if (P99_LIKELY(iMax >= istep)) { if (P99_LIKELY(jMax >= jstep)) { if (P99_LIKELY(kMax >= kstep)) { /* This is the principal case for which we want to optimize as much as possible. */ INNER_CASE(C, A, BS, i, istep, j, jstep, k, kstep); } else { INNER_CASE(C, A, BS, i, istep, j, jstep, k, kMax); } } else { if (kMax >= kstep) { INNER_CASE(C, A, BS, i, istep, j, jMax, k, kstep); } else { INNER_CASE(C, A, BS, i, istep, j, jMax, k, kMax); } } } else { if (jMax >= jstep) { if (kMax >= kstep) { INNER_CASE(C, A, BS, i, iMax, j, jstep, k, kstep); } else { INNER_CASE(C, A, BS, i, iMax, j, jstep, k, kMax); } } else { if (kMax >= kstep) { INNER_CASE(C, A, BS, i, iMax, j, jMax, k, kstep); } else { INNER_CASE(C, A, BS, i, iMax, j, jMax, k, kMax); } } } } } } double times2 = omp_get_wtime(); printf("%s time:\t%g\n", "setup", times1 - times0); printf("%s time:\t%g\n", "mult", times2 - times1); free((void*)BS); } int main(int argc, char*argv[]) { size_t p = 1; if (argc > 4) p = strtoul(argv[4], NULL, 0); omp_set_num_threads(p); double const P99_ARRAY(A, 3, 2) = { {2, 3 }, {1, 1 }, {-1, 0} }; P99_AREF(double const, AP, 3, 2) = &A; print(AP); double const P99_ARRAY(B, 2, 4) = { {2, 3, -1, 0}, {1, 1, 2, 1}, }; P99_AREF(double const, BP, 2, 4) = &B; print(BP); dotproduct(&(*BP)[0], &(*BP)[1]); double P99_ARRAY(C, 3, 4) = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, }; P99_AREF(double, CP, 3, 4) = &C; mult(CP, AP, BP); print(CP); /* captured by the checks inside the macro: */ // struct {double a[2];} * p; P99_ALEN(p); /* not captured: */ // char * q; P99_ALEN(q); size_t n = strtoul(argv[1]); size_t m = strtoul(argv[2]); size_t k = strtoul(argv[3]); printf("matrix length are n=%zu, m=%zu, k=%zu\n", n, m, k); printf("step length are n=%d, m=%d, k=%d\n", istep, jstep, kstep); P99_AREF(double, AR, n, k) = malloc(sizeof(*AR)); P99_AREF(double, BR, k, m) = malloc(sizeof(*BR)); /* for (size_t j = 0; j < n; ++j) */ /* for (size_t i = 0; i < k; ++i) */ size_t const D[2] = {n, k}; P99_FORALL(D, j, i) (*AR)[j][i] = i*j; /* for (size_t i = 0; i < k; ++i) */ /* for (size_t j = 0; j < m; ++j) */ size_t const E[2] = {k, m}; P99_FORALL(E, i, j) (*BR)[i][j] = j + i; P99_AREF(double, CR, n, m) = P99_CALLOC(double, n*m); mult(CR, AR, BR); free(AR); free(BR); free(CR); } /* follow just some arbitrary compile tests */ enum { A = 3 }; size_t A0 = P99_IPOW(0, A); size_t A1 = P99_IPOW(1, A); size_t A2 = P99_IPOW(2, A); size_t A3 = P99_IPOW(3, A); size_t A4 = P99_IPOW(4, A);
1.7.6.1