Skip to content

Commit a1a5a9b

Browse files
authored
Implemented support for fat multithreading. (#665)
Details: - Allow the user to configure BLIS in such a way that multiple threading implementations get compiled into the library, with one of those implementations chosen at runtime. For now, there are only three implementations available: OpenMP, pthreads, and single. (Here, 'single' merely refers to single-threaded mode.) The configure script now allows the user to give the -t option with a comma-separated list of values, such as '-t openmp,pthreads'. The first value in the list will always be the default at library initialization time, and 'single' is always silently appended to the end of the list. The user can specify which implementation should execute in one of three ways: by setting the BLIS_THREAD_IMPL environment variable prior to launch; by calling the bli_thread_set_thread_impl() global runtime API; or by encoding their choice into a rntm_t that is passed into one of the expert interfaces. Any of these three choices overrides the initialization-time default (i.e., the first value listed to the -t configure option). Requesting an implementation that was not compiled into the library will result in an error message followed by bli_abort(). - Relocated the 'auto' logic for the -t option from the top-level Makefile to the configure script. (Currently, this logic is pretty dumb, choosing 'openmp' for gcc and icc, and 'pthreads' for clang.) - Defined a new 'timpl_t' enum in bli_type_defs.h, with three valid values: BLIS_SINGLE, BLIS_OPENMP, BLIS_POSIX. - Reorganized the thrcomm_t struct into a single definition with two preprocessor blocks, one each for additional fields needed by OpenMP and pthreads. - Added timpl_t argument to bli_thrcomm_bcast(), bli_thrcomm_barrier(), bli_thrcomm_init(), and bli_thrcomm_cleanup(), which these functions need since they are now wrappers that choose the implementation-specific function corresponding to the currently enabled threading implementation.
- Added rntm_t* to bli_thread_broadcast(), bli_thread_barrier() so that those functions can pass the timpl_t value into bli_thrcomm_bcast() and bli_thrcomm_barrier(), respectively. - Defined bli_env_get_str() in bli_env.c to allow the querying of BLIS_THREAD_IMPL (which, unlike BLIS_NUM_THREADS and friends, is expected to be a string). - Defined bli_thread_get_thread_impl(), bli_thread_set_thread_impl() to get and set the current threading implementation at runtime. - Defined bli_rntm_thread_impl() and bli_rntm_set_thread_impl() to query and set the threading implementation within a rntm_t. Also choose BLIS_SINGLE as the default value when initializing rntm_t structs. - Added bli_info_get_*() functions to query whether OpenMP or pthreads would be chosen as the default at init-time. Note that this only tests whether OpenMP or pthreads is the first implementation in the list passed to the threading configure option (-t) and is *not* the same as querying which implementation is currently selected, since that can be influenced by BLIS_THREAD_IMPL and/or bli_thread_set_thread_impl(). - Changed l3int_t to l3int_ft. - Updated docs/Multithreading.md to document the new behavior. - Updated sandbox/gemmlike and addon/gemmd to work with the new fat threading feature. This included a few bugfixes to bring the codes up to date, as necessary. - Comment, whitespace updates.
1 parent 89df7b8 commit a1a5a9b

File tree

83 files changed

+2083
-717
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+2083
-717
lines changed

Makefile

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,24 +1149,24 @@ endif # ifeq ($(IS_WIN),no)
11491149
# --- Query current configuration ---
11501150

11511151
showconfig: check-env
1152-
@echo "configuration family: $(CONFIG_NAME)"
1153-
@echo "sub-configurations: $(CONFIG_LIST)"
1154-
@echo "requisite kernels sets: $(KERNEL_LIST)"
1155-
@echo "kernel-to-config map: $(KCONFIG_MAP)"
1152+
@echo "configuration family: $(CONFIG_NAME)"
1153+
@echo "sub-configurations: $(CONFIG_LIST)"
1154+
@echo "requisite kernels sets: $(KERNEL_LIST)"
1155+
@echo "kernel-to-config map: $(KCONFIG_MAP)"
11561156
@echo "-------------------------"
1157-
@echo "BLIS version string: $(VERSION)"
1158-
@echo ".so major version: $(SO_MAJOR)"
1159-
@echo ".so minor.build vers: $(SO_MINORB)"
1160-
@echo "install libdir: $(INSTALL_LIBDIR)"
1161-
@echo "install includedir: $(INSTALL_INCDIR)"
1162-
@echo "install sharedir: $(INSTALL_SHAREDIR)"
1163-
@echo "debugging status: $(DEBUG_TYPE)"
1164-
@echo "multithreading status: $(THREADING_MODEL)"
1165-
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
1166-
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
1167-
@echo "build static library? $(MK_ENABLE_STATIC)"
1168-
@echo "build shared library? $(MK_ENABLE_SHARED)"
1169-
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
1157+
@echo "BLIS version string: $(VERSION)"
1158+
@echo ".so major version: $(SO_MAJOR)"
1159+
@echo ".so minor.build vers: $(SO_MINORB)"
1160+
@echo "install libdir: $(INSTALL_LIBDIR)"
1161+
@echo "install includedir: $(INSTALL_INCDIR)"
1162+
@echo "install sharedir: $(INSTALL_SHAREDIR)"
1163+
@echo "debugging status: $(DEBUG_TYPE)"
1164+
@echo "enabled threading model(s): $(THREADING_MODEL)"
1165+
@echo "enable BLAS API? $(MK_ENABLE_BLAS)"
1166+
@echo "enable CBLAS API? $(MK_ENABLE_CBLAS)"
1167+
@echo "build static library? $(MK_ENABLE_STATIC)"
1168+
@echo "build shared library? $(MK_ENABLE_SHARED)"
1169+
@echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)"
11701170

11711171

11721172
# --- Clean rules ---

addon/gemmd/attic/bli_gemm_ex.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@
3636

3737
void bli_gemm_ex
3838
(
39-
obj_t* alpha,
40-
obj_t* a,
41-
obj_t* b,
42-
obj_t* beta,
43-
obj_t* c,
44-
cntx_t* cntx,
45-
rntm_t* rntm
39+
const obj_t* alpha,
40+
const obj_t* a,
41+
const obj_t* b,
42+
const obj_t* beta,
43+
const obj_t* c,
44+
const cntx_t* cntx,
45+
rntm_t* rntm
4646
)
4747
{
4848
bli_init_once();
@@ -82,7 +82,8 @@ void bli_gemm_ex
8282
// Invoke the operation's front end.
8383
bli_gemm_front
8484
(
85-
alpha, a, b, beta, c, cntx, rntm, NULL
85+
( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c,
86+
( cntx_t* )cntx, ( rntm_t* )rntm, NULL
8687
);
8788
}
8889

addon/gemmd/bao_gemmd.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,16 +81,28 @@ void bao_gemmd_ex
8181
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
8282
else { rntm_l = *rntm; rntm = &rntm_l; }
8383

84+
// Set the .pack_a and .pack_b fields to TRUE. This is only needed because
85+
// this addon uses bli_thrinfo_sup_grow(), which calls
86+
// bli_thrinfo_sup_create_for_cntl(), which employs an optimization if
87+
// both fields are FALSE (as is often the case with sup). However, this
88+
// addon implements the "large" code path, and so both A and B must
89+
// always be packed. Setting the fields to TRUE will avoid the optimization
90+
// while this addon implementation executes (and it also reinforces the
91+
// fact that we *are* indeed packing A and B, albeit not in the sup context
92+
// originally envisioned for the .pack_a and .pack_b fields).
93+
bli_rntm_set_pack_a( TRUE, rntm );
94+
bli_rntm_set_pack_b( TRUE, rntm );
95+
8496
// Obtain a valid (native) context from the gks if necessary.
8597
// NOTE: This must be done before calling the _check() function, since
8698
// that function assumes the context pointer is valid.
87-
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
99+
if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx();
88100

89101
// Check parameters.
90102
if ( bli_error_checking_is_enabled() )
91103
bao_gemmd_check( alpha, a, d, b, beta, c, cntx );
92104

93-
// -- bli_gemmd_front() ----------------------------------------------------
105+
// -- bao_gemmd_front() ----------------------------------------------------
94106

95107
obj_t a_local;
96108
obj_t b_local;

addon/gemmd/bao_gemmd_bp_var1.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \
458458
/* This barrier is needed to prevent threads from starting to pack
459459
the next row panel of B before the current row panel is fully
460460
computed upon. */ \
461-
bli_thread_barrier( thread_pb ); \
461+
bli_thread_barrier( rntm, thread_pb ); \
462462
} \
463463
} \
464464
\

addon/gemmd/bao_l3_packm_a.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
6161
\
6262
/* Barrier to make sure all threads are caught up and ready to begin the
6363
packm stage. */ \
64-
bli_thread_barrier( thread ); \
64+
bli_thread_barrier( rntm, thread ); \
6565
\
6666
/* Compute the size of the memory block needed. */ \
6767
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
9090
\
9191
/* Broadcast the address of the chief thread's passed-in mem_t to all
9292
threads. */ \
93-
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
93+
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
9494
\
9595
/* Non-chief threads: Copy the contents of the chief thread's
9696
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
139139
\
140140
/* Broadcast the address of the chief thread's passed-in mem_t
141141
to all threads. */ \
142-
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
142+
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
143143
\
144144
/* Non-chief threads: Copy the contents of the chief thread's
145145
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \
319319
); \
320320
\
321321
/* Barrier so that packing is done before computation. */ \
322-
bli_thread_barrier( thread ); \
322+
bli_thread_barrier( rntm, thread ); \
323323
}
324324

325325
//INSERT_GENTFUNC_BASIC0( packm_a )

addon/gemmd/bao_l3_packm_b.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
6161
\
6262
/* Barrier to make sure all threads are caught up and ready to begin the
6363
packm stage. */ \
64-
bli_thread_barrier( thread ); \
64+
bli_thread_barrier( rntm, thread ); \
6565
\
6666
/* Compute the size of the memory block needed. */ \
6767
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
9090
\
9191
/* Broadcast the address of the chief thread's passed-in mem_t to all
9292
threads. */ \
93-
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
93+
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
9494
\
9595
/* Non-chief threads: Copy the contents of the chief thread's
9696
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
139139
\
140140
/* Broadcast the address of the chief thread's passed-in mem_t
141141
to all threads. */ \
142-
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
142+
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
143143
\
144144
/* Non-chief threads: Copy the contents of the chief thread's
145145
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \
319319
); \
320320
\
321321
/* Barrier so that packing is done before computation. */ \
322-
bli_thread_barrier( thread ); \
322+
bli_thread_barrier( rntm, thread ); \
323323
}
324324

325325
//INSERT_GENTFUNC_BASIC0( packm_b )

addon/gemmd/thread/bao_l3_decor.c

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/*
2+
3+
BLIS
4+
An object-based framework for developing high-performance BLAS-like
5+
libraries.
6+
7+
Copyright (C) 2022, The University of Texas at Austin
8+
9+
Redistribution and use in source and binary forms, with or without
10+
modification, are permitted provided that the following conditions are
11+
met:
12+
- Redistributions of source code must retain the above copyright
13+
notice, this list of conditions and the following disclaimer.
14+
- Redistributions in binary form must reproduce the above copyright
15+
notice, this list of conditions and the following disclaimer in the
16+
documentation and/or other materials provided with the distribution.
17+
- Neither the name(s) of the copyright holder(s) nor the names of its
18+
contributors may be used to endorse or promote products derived
19+
from this software without specific prior written permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32+
33+
*/
34+
35+
#include "blis.h"
36+
37+
// Initialize a function pointer array containing function addresses for
38+
// each of the threading-specific level-3 thread decorators.
39+
40+
static l3ao_decor_ft l3ao_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] =
41+
{
42+
[BLIS_SINGLE] = bao_l3_thread_decorator_single,
43+
[BLIS_OPENMP] =
44+
#if defined(BLIS_ENABLE_OPENMP)
45+
bao_l3_thread_decorator_openmp,
46+
#elif defined(BLIS_ENABLE_PTHREADS)
47+
NULL,
48+
#else
49+
NULL,
50+
#endif
51+
[BLIS_POSIX] =
52+
#if defined(BLIS_ENABLE_PTHREADS)
53+
bao_l3_thread_decorator_pthreads,
54+
#elif defined(BLIS_ENABLE_OPENMP)
55+
NULL,
56+
#else
57+
NULL,
58+
#endif
59+
};
60+
61+
// Define a dispatcher that chooses a threading-specific function from the
62+
// above function pointer array.
63+
64+
void bao_l3_thread_decorator
65+
(
66+
l3aoint_ft func,
67+
opid_t family,
68+
obj_t* alpha,
69+
obj_t* a,
70+
obj_t* d,
71+
obj_t* b,
72+
obj_t* beta,
73+
obj_t* c,
74+
cntx_t* cntx,
75+
rntm_t* rntm
76+
)
77+
{
78+
rntm_t rntm_l;
79+
80+
// Query the threading implementation and the number of threads requested.
81+
timpl_t ti = bli_rntm_thread_impl( rntm );
82+
dim_t nt = bli_rntm_num_threads( rntm );
83+
84+
if ( bli_error_checking_is_enabled() )
85+
bao_l3_thread_decorator_check( rntm );
86+
87+
if ( 1 < nt && ti == BLIS_SINGLE )
88+
{
89+
// Here, we resolve conflicting information. The caller requested
90+
// a sequential threading implementation, but also requested more
91+
// than one thread. Here, we choose to favor the requested threading
92+
// implementation over the number of threads, and so reset all
93+
// parallelism parameters to 1.
94+
rntm_l = *rntm;
95+
nt = 1;
96+
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l );
97+
bli_rntm_set_num_threads_only( 1, &rntm_l );
98+
rntm = &rntm_l;
99+
}
100+
101+
// Use the timpl_t value to index into the corresponding function address
102+
// from the function pointer array.
103+
const l3ao_decor_ft fp = l3ao_decor_fpa[ ti ];
104+
105+
// Call the threading-specific decorator function.
106+
fp
107+
(
108+
func,
109+
family,
110+
alpha,
111+
a,
112+
d,
113+
b,
114+
beta,
115+
c,
116+
cntx,
117+
rntm
118+
);
119+
}
120+
121+
void bao_l3_thread_decorator_check
122+
(
123+
rntm_t* rntm
124+
)
125+
{
126+
//err_t e_val;
127+
128+
//e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) );
129+
//bli_check_error_code( e_val );
130+
131+
const timpl_t ti = bli_rntm_thread_impl( rntm );
132+
133+
if (
134+
#ifndef BLIS_ENABLE_OPENMP
135+
ti == BLIS_OPENMP ||
136+
#endif
137+
#ifndef BLIS_ENABLE_PTHREADS
138+
ti == BLIS_POSIX ||
139+
#endif
140+
FALSE
141+
)
142+
{
143+
fprintf( stderr, "\n" );
144+
fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
145+
fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) );
146+
fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ );
147+
bli_abort();
148+
}
149+
}
150+

0 commit comments

Comments
 (0)