LLVM OpenMP* Runtime Library
kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
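
// Illustrative call pattern (a sketch added for exposition, not a definition
// from this header; the ITT argument is omitted and the variable names are
// hypothetical): the dispatcher initializes the per-thread descriptor once,
// then pulls chunks until the "next" algorithm reports that no work remains:
//
//   __kmp_dispatch_init_algorithm<kmp_int32>(loc, gtid, pr, schedule, lb, ub,
//                                            st, chunk, nproc, tid);
//   while (__kmp_dispatch_next_algorithm<kmp_int32>(gtid, pr, sh, &last, &lb,
//                                                   &ub, &st, nproc, tid)) {
//     // execute iterations lb..ub with stride st
//   }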

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  // a) parm3 is properly aligned and
  // b) all of parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
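
// Layout sketch (added for exposition; assumes sizeof(T) == 8 and a 64-byte
// cache line): KMP_ALIGN(32) starts the anonymous struct on a 32-byte
// boundary, so parm1..parm4 occupy at most 32 contiguous, 32-byte-aligned
// bytes and therefore cannot straddle a cache-line boundary:
//
//   parm1 @ +0, parm2 @ +8, parm3 @ +16, parm4 @ +24  ->  one cache line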

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  kmp_uint32 ordered_bumped;
  // to retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}
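
// Usage note (added for exposition): the KMP_TEST_THEN_* macros behave like
// atomic fetch-and-add / fetch-and-increment, so each wrapper returns the
// value observed before the update. For example, __kmp_dispatch_dxo() below
// advances the shared ordered-iteration counter with:
//
//   test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);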

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
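
// Illustrative retry loop (a sketch added for exposition, not code from this
// header; 'counter' is a hypothetical shared field): compare_and_swap() is
// expected to return non-zero only when *p still held the expected value, so
// a caller claiming work reloads and retries on failure:
//
//   kmp_int32 old_val = *counter;
//   while (!compare_and_swap<kmp_int32>(counter, old_val, old_val + 1)) {
//     old_val = *counter; // lost the race; reload and retry
//   }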

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
   Spin wait loop that pauses between checks.
   Waits until the predicate returns non-zero when called with *spinner and
   check. Does NOT put threads to sleep.
   Arguments:
       UT is an unsigned 4- or 8-byte type
       spinner - memory location whose value is checked
       checker - value against which *spinner is compared (>, <, ==, etc.)
       pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
       obj -- higher-level synchronization object to report to ittnotify. It
       is used to report locks consistently. For example, if a lock is
       acquired immediately, its address is reported to ittnotify via
       KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
       immediately and the lock routine calls KMP_WAIT(), the latter should
       report the same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
   TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */
    // If oversubscribed, or if we have waited a bit, then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
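
// Example use (this is the call made by __kmp_dispatch_deo() below): spin
// until the shared ordered_iteration counter reaches this thread's ordered
// lower bound:
//
//   __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
//                  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));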

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
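
// Worked example (added for exposition): __kmp_pow() is exponentiation by
// squaring. For x = 0.5, y = 5 (binary 101) the loop multiplies s by the
// current x for each set bit and squares x between bits:
//   bit 0 (set):   s = 1.0 * 0.5    = 0.5      x -> 0.25
//   bit 1 (clear): s unchanged                 x -> 0.0625
//   bit 2 (set):   s = 0.5 * 0.0625 = 0.03125
// giving 0.5^5 = 0.03125 in O(log y) multiplications.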

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).

   __forceinline appears to be broken here: if this function is __forceinline'd,
   the behavior is wrong (one of the unit tests, sch_guided_analytical_basic.cpp,
   fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     work around that in the caller code by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
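
// Worked example (added for exposition; the base value is arbitrary): with
// tc = 1000 total iterations and base = 0.75, after idx = 2 chunks have been
// assigned the function computes 1000 * 0.75^2 = 562.5 and rounds up, i.e.
// it returns 563 = ceil(tc * base^idx) unassigned iterations.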

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )   // point of switching to dynamic
//   p3 = 1 / ( n * nproc )           // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
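
// Worked example (added for exposition; the concrete numbers are illustrative
// only): with the default n = guided_int_param = 2, nproc = 4 threads and
// chunk = 7,
//   p2 = 2 * 4 * (7 + 1) = 64    // switch to dynamic around 64 remaining iterations
//   p3 = 1 / (2 * 4)     = 0.125
// so each guided chunk is roughly the remaining iteration count scaled by
// 0.125 (note p3 == guided_flt_param / nproc).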
#endif // KMP_DISPATCH_H