1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  * it may change values between parallel regions. __kmp_max_nth
17  * is the largest value __kmp_nth may take; 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44 
45  KMP_DEBUG_ASSERT(gtid_ref);
46 
47  if (__kmp_env_consistency_check) {
48  th = __kmp_threads[*gtid_ref];
49  if (th->th.th_root->r.r_active &&
50  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56  }
57  }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62 
63  if (__kmp_env_consistency_check) {
64  th = __kmp_threads[*gtid_ref];
65  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67  }
68  }
69 }
70 
71 // Initialize a dispatch_private_info_template<T> buffer for a particular
72 // type of schedule,chunk. The loop description is found in lb (lower bound),
73 // ub (upper bound), and st (stride). nproc is the number of threads relevant
74 // to the scheduling (often the number of threads in a team, but not always if
75 // hierarchical scheduling is used). tid is the id of the thread calling
76 // the function within the group of nproc threads. It will have a value
77 // between 0 and nproc - 1. This is often just the thread id within a team, but
78 // is not necessarily the case when using hierarchical scheduling.
79 // loc is the source file location of the corresponding loop
80 // gtid is the global thread id
81 template <typename T>
82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
83  dispatch_private_info_template<T> *pr,
84  enum sched_type schedule, T lb, T ub,
85  typename traits_t<T>::signed_t st,
86 #if USE_ITT_BUILD
87  kmp_uint64 *cur_chunk,
88 #endif
89  typename traits_t<T>::signed_t chunk,
90  T nproc, T tid) {
91  typedef typename traits_t<T>::unsigned_t UT;
92  typedef typename traits_t<T>::floating_t DBL;
93 
94  int active;
95  T tc;
96  kmp_info_t *th;
97  kmp_team_t *team;
98 
99 #ifdef KMP_DEBUG
100  typedef typename traits_t<T>::signed_t ST;
101  {
102  char *buff;
103  // create format specifiers before the debug output
104  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
105  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
106  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
107  traits_t<T>::spec, traits_t<T>::spec,
108  traits_t<ST>::spec, traits_t<ST>::spec,
109  traits_t<T>::spec, traits_t<T>::spec);
110  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
111  __kmp_str_free(&buff);
112  }
113 #endif
114  /* setup data */
115  th = __kmp_threads[gtid];
116  team = th->th.th_team;
117  active = !team->t.t_serialized;
118 
119 #if USE_ITT_BUILD
120  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
121  __kmp_forkjoin_frames_mode == 3 &&
122  KMP_MASTER_GTID(gtid) &&
123 #if OMP_40_ENABLED
124  th->th.th_teams_microtask == NULL &&
125 #endif
126  team->t.t_active_level == 1;
127 #endif
128 #if (KMP_STATIC_STEAL_ENABLED)
129  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
130  // AC: we now have only one implementation of stealing, so use it
131  schedule = kmp_sch_static_steal;
132  else
133 #endif
134  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
135 
136  /* Pick up the nomerge/ordered bits from the scheduling type */
137  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
138  pr->flags.nomerge = TRUE;
139  schedule =
140  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
141  } else {
142  pr->flags.nomerge = FALSE;
143  }
144  pr->type_size = traits_t<T>::type_size; // remember the size of variables
145  if (kmp_ord_lower & schedule) {
146  pr->flags.ordered = TRUE;
147  schedule =
148  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
149  } else {
150  pr->flags.ordered = FALSE;
151  }
152 
153  if (schedule == kmp_sch_static) {
154  schedule = __kmp_static;
155  } else {
156  if (schedule == kmp_sch_runtime) {
157  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
158  // not specified)
159  schedule = team->t.t_sched.r_sched_type;
160  // Detail the schedule if needed (global controls are differentiated
161  // appropriately)
162  if (schedule == kmp_sch_guided_chunked) {
163  schedule = __kmp_guided;
164  } else if (schedule == kmp_sch_static) {
165  schedule = __kmp_static;
166  }
167  // Use the chunk size specified by OMP_SCHEDULE (or default if not
168  // specified)
169  chunk = team->t.t_sched.chunk;
170 #if USE_ITT_BUILD
171  if (cur_chunk)
172  *cur_chunk = chunk;
173 #endif
174 #ifdef KMP_DEBUG
175  {
176  char *buff;
177  // create format specifiers before the debug output
178  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
179  "schedule:%%d chunk:%%%s\n",
180  traits_t<ST>::spec);
181  KD_TRACE(10, (buff, gtid, schedule, chunk));
182  __kmp_str_free(&buff);
183  }
184 #endif
185  } else {
186  if (schedule == kmp_sch_guided_chunked) {
187  schedule = __kmp_guided;
188  }
189  if (chunk <= 0) {
190  chunk = KMP_DEFAULT_CHUNK;
191  }
192  }
193 
194  if (schedule == kmp_sch_auto) {
195  // mapping and differentiation: in the __kmp_do_serial_initialize()
196  schedule = __kmp_auto;
197 #ifdef KMP_DEBUG
198  {
199  char *buff;
200  // create format specifiers before the debug output
201  buff = __kmp_str_format(
202  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
203  "schedule:%%d chunk:%%%s\n",
204  traits_t<ST>::spec);
205  KD_TRACE(10, (buff, gtid, schedule, chunk));
206  __kmp_str_free(&buff);
207  }
208 #endif
209  }
210 
211  /* guided analytical not safe for too many threads */
212  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
213  schedule = kmp_sch_guided_iterative_chunked;
214  KMP_WARNING(DispatchManyThreads);
215  }
216 #if OMP_45_ENABLED
217  if (schedule == kmp_sch_runtime_simd) {
218  // compiler provides simd_width in the chunk parameter
219  schedule = team->t.t_sched.r_sched_type;
220  // Detail the schedule if needed (global controls are differentiated
221  // appropriately)
222  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
223  schedule == __kmp_static) {
224  schedule = kmp_sch_static_balanced_chunked;
225  } else {
226  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
227  schedule = kmp_sch_guided_simd;
228  }
229  chunk = team->t.t_sched.chunk * chunk;
230  }
231 #if USE_ITT_BUILD
232  if (cur_chunk)
233  *cur_chunk = chunk;
234 #endif
235 #ifdef KMP_DEBUG
236  {
237  char *buff;
238  // create format specifiers before the debug output
239  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
240  " chunk:%%%s\n",
241  traits_t<ST>::spec);
242  KD_TRACE(10, (buff, gtid, schedule, chunk));
243  __kmp_str_free(&buff);
244  }
245 #endif
246  }
247 #endif // OMP_45_ENABLED
248  pr->u.p.parm1 = chunk;
249  }
250  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
251  "unknown scheduling type");
252 
253  pr->u.p.count = 0;
254 
255  if (__kmp_env_consistency_check) {
256  if (st == 0) {
257  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
258  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
259  }
260  }
261  // compute trip count
262  if (st == 1) { // most common case
263  if (ub >= lb) {
264  tc = ub - lb + 1;
265  } else { // ub < lb
266  tc = 0; // zero-trip
267  }
268  } else if (st < 0) {
269  if (lb >= ub) {
270  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
271  // where the division needs to be unsigned regardless of the result type
272  tc = (UT)(lb - ub) / (-st) + 1;
273  } else { // lb < ub
274  tc = 0; // zero-trip
275  }
276  } else { // st > 0
277  if (ub >= lb) {
278  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
279  // where the division needs to be unsigned regardless of the result type
280  tc = (UT)(ub - lb) / st + 1;
281  } else { // ub < lb
282  tc = 0; // zero-trip
283  }
284  }
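  // For illustration, the trip count formulas above give, e.g.:
  //   lb=0,  ub=9,  st=1  ->  tc = 9 - 0 + 1        = 10
  //   lb=0,  ub=10, st=3  ->  tc = (10 - 0) / 3 + 1 = 4   (i = 0, 3, 6, 9)
  //   lb=10, ub=1,  st=-3 ->  tc = (10 - 1) / 3 + 1 = 4   (i = 10, 7, 4, 1)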
285 
286 #if KMP_STATS_ENABLED
287  if (KMP_MASTER_GTID(gtid)) {
288  KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
289  }
290 #endif
291 
292  pr->u.p.lb = lb;
293  pr->u.p.ub = ub;
294  pr->u.p.st = st;
295  pr->u.p.tc = tc;
296 
297 #if KMP_OS_WINDOWS
298  pr->u.p.last_upper = ub + st;
299 #endif /* KMP_OS_WINDOWS */
300 
301  /* NOTE: only the active parallel region(s) have active ordered sections */
302 
303  if (active) {
304  if (pr->flags.ordered) {
305  pr->ordered_bumped = 0;
306  pr->u.p.ordered_lower = 1;
307  pr->u.p.ordered_upper = 0;
308  }
309  }
310 
311  switch (schedule) {
312 #if (KMP_STATIC_STEAL_ENABLED)
313  case kmp_sch_static_steal: {
314  T ntc, init;
315 
316  KD_TRACE(100,
317  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
318  gtid));
319 
320  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
321  if (nproc > 1 && ntc >= nproc) {
322  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
323  T id = tid;
324  T small_chunk, extras;
325 
326  small_chunk = ntc / nproc;
327  extras = ntc % nproc;
328 
329  init = id * small_chunk + (id < extras ? id : extras);
330  pr->u.p.count = init;
331  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
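  // For illustration (arbitrary values): ntc=10 chunks and nproc=4 give
  // small_chunk=2, extras=2, so the initial per-thread chunk ranges
  // [count, ub) are [0,3), [3,6), [6,8), [8,10) -- the first 'extras'
  // threads own one extra chunk each; imbalance is corrected later by
  // stealing.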
332 
333  pr->u.p.parm2 = lb;
334  // pr->pfields.parm3 = 0; // it's not used in static_steal
335  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
336  pr->u.p.st = st;
337  if (traits_t<T>::type_size > 4) {
338  // AC: TODO: check if 16-byte CAS available and use it to
339  // improve performance (probably wait for explicit request
340  // before spending time on this).
341  // For now use dynamically allocated per-thread lock,
342  // free memory in __kmp_dispatch_next when status==0.
343  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
344  th->th.th_dispatch->th_steal_lock =
345  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
346  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
347  }
348  break;
349  } else {
350  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
351  "kmp_sch_static_balanced\n",
352  gtid));
353  schedule = kmp_sch_static_balanced;
354  /* too few iterations: fall-through to kmp_sch_static_balanced */
355  } // if
356  /* FALL-THROUGH to static balanced */
357  KMP_FALLTHROUGH();
358  } // case
359 #endif
360  case kmp_sch_static_balanced: {
361  T init, limit;
362 
363  KD_TRACE(
364  100,
365  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
366  gtid));
367 
368  if (nproc > 1) {
369  T id = tid;
370 
371  if (tc < nproc) {
372  if (id < tc) {
373  init = id;
374  limit = id;
375  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
376  } else {
377  pr->u.p.count = 1; /* means no more chunks to execute */
378  pr->u.p.parm1 = FALSE;
379  break;
380  }
381  } else {
382  T small_chunk = tc / nproc;
383  T extras = tc % nproc;
384  init = id * small_chunk + (id < extras ? id : extras);
385  limit = init + small_chunk - (id < extras ? 0 : 1);
386  pr->u.p.parm1 = (id == nproc - 1);
387  }
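  // For illustration (arbitrary values): tc=10 iterations and nproc=4 give
  // small_chunk=2, extras=2, so the per-thread [init, limit] ranges are
  // [0,2], [3,5], [6,7], [8,9]; only the last thread (id == nproc - 1) can
  // execute the last iteration, hence the parm1 flag above.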
388  } else {
389  if (tc > 0) {
390  init = 0;
391  limit = tc - 1;
392  pr->u.p.parm1 = TRUE;
393  } else {
394  // zero trip count
395  pr->u.p.count = 1; /* means no more chunks to execute */
396  pr->u.p.parm1 = FALSE;
397  break;
398  }
399  }
400 #if USE_ITT_BUILD
401  // Calculate chunk for metadata report
402  if (itt_need_metadata_reporting)
403  if (cur_chunk)
404  *cur_chunk = limit - init + 1;
405 #endif
406  if (st == 1) {
407  pr->u.p.lb = lb + init;
408  pr->u.p.ub = lb + limit;
409  } else {
410  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
411  T ub_tmp = lb + limit * st;
412  pr->u.p.lb = lb + init * st;
413  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
414  // it exactly
415  if (st > 0) {
416  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
417  } else {
418  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
419  }
420  }
421  if (pr->flags.ordered) {
422  pr->u.p.ordered_lower = init;
423  pr->u.p.ordered_upper = limit;
424  }
425  break;
426  } // case
427 #if OMP_45_ENABLED
428  case kmp_sch_static_balanced_chunked: {
429  // similar to balanced, but chunk adjusted to multiple of simd width
430  T nth = nproc;
431  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
432  " -> falling-through to static_greedy\n",
433  gtid));
434  schedule = kmp_sch_static_greedy;
435  if (nth > 1)
436  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
437  else
438  pr->u.p.parm1 = tc;
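  // For illustration (arbitrary values, assuming chunk -- the simd width --
  // is a power of two so the bitmask above rounds up to a multiple of it):
  // tc=100, nth=8, chunk=8 gives ceil(100/8) = 13, rounded up to
  // parm1 = 16 iterations per thread.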
439  break;
440  } // case
441  case kmp_sch_guided_simd:
442 #endif // OMP_45_ENABLED
443  case kmp_sch_guided_iterative_chunked: {
444  KD_TRACE(
445  100,
446  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
447  " case\n",
448  gtid));
449 
450  if (nproc > 1) {
451  if ((2L * chunk + 1) * nproc >= tc) {
452  /* chunk size too large, switch to dynamic */
453  schedule = kmp_sch_dynamic_chunked;
454  } else {
455  // when remaining iters become less than parm2 - switch to dynamic
456  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
457  *(double *)&pr->u.p.parm3 =
458  guided_flt_param / nproc; // may occupy parm3 and parm4
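  // For illustration, with the default guided parameters (K=2, per the
  // "K=2 by default" note in __kmp_dispatch_next_algorithm) and, say,
  // nproc=4, chunk=7: parm2 = 2*4*(7+1) = 64 and parm3 holds
  // 1/(K*nproc) = 0.125, so each grab takes about remaining/8 iterations
  // until fewer than 64 remain, after which dynamic chunks of 7 are used.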
459  }
460  } else {
461  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
462  "kmp_sch_static_greedy\n",
463  gtid));
464  schedule = kmp_sch_static_greedy;
465  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
466  KD_TRACE(
467  100,
468  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
469  gtid));
470  pr->u.p.parm1 = tc;
471  } // if
472  } // case
473  break;
474  case kmp_sch_guided_analytical_chunked: {
475  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
476  "kmp_sch_guided_analytical_chunked case\n",
477  gtid));
478 
479  if (nproc > 1) {
480  if ((2L * chunk + 1) * nproc >= tc) {
481  /* chunk size too large, switch to dynamic */
482  schedule = kmp_sch_dynamic_chunked;
483  } else {
484  /* commonly used term: (2 nproc - 1)/(2 nproc) */
485  DBL x;
486 
487 #if KMP_USE_X87CONTROL
488  /* Linux* OS already has 64-bit computation by default for long double,
489  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
490  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
491  instead of the default 53-bit. Even though long double doesn't work
492  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
493  expected to impact the correctness of the algorithm, but this has not
494  been mathematically proven. */
495  // save original FPCW and set precision to 64-bit, as
496  // Windows* OS on IA-32 architecture defaults to 53-bit
497  unsigned int oldFpcw = _control87(0, 0);
498  _control87(_PC_64, _MCW_PC); // 0,0x30000
499 #endif
500  /* value used for comparison in solver for cross-over point */
501  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
502 
503  /* crossover point--chunk indexes equal to or greater than
504  this point switch to dynamic-style scheduling */
505  UT cross;
506 
507  /* commonly used term: (2 nproc - 1)/(2 nproc) */
508  x = (long double)1.0 - (long double)0.5 / nproc;
509 
510 #ifdef KMP_DEBUG
511  { // test natural alignment
512  struct _test_a {
513  char a;
514  union {
515  char b;
516  DBL d;
517  };
518  } t;
519  ptrdiff_t natural_alignment =
520  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
521  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
522  // long)natural_alignment );
523  KMP_DEBUG_ASSERT(
524  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
525  }
526 #endif // KMP_DEBUG
527 
528  /* save the term in thread private dispatch structure */
529  *(DBL *)&pr->u.p.parm3 = x;
530 
531  /* solve for the crossover point to the nearest integer i for which C_i
532  <= chunk */
533  {
534  UT left, right, mid;
535  long double p;
536 
537  /* estimate initial upper and lower bound */
538 
539  /* doesn't matter what value right is as long as it is positive, but
540  it affects performance of the solver */
541  right = 229;
542  p = __kmp_pow<UT>(x, right);
543  if (p > target) {
544  do {
545  p *= p;
546  right <<= 1;
547  } while (p > target && right < (1 << 27));
548  /* lower bound is previous (failed) estimate of upper bound */
549  left = right >> 1;
550  } else {
551  left = 0;
552  }
553 
554  /* bisection root-finding method */
555  while (left + 1 < right) {
556  mid = (left + right) / 2;
557  if (__kmp_pow<UT>(x, mid) > target) {
558  left = mid;
559  } else {
560  right = mid;
561  }
562  } // while
563  cross = right;
564  }
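  /* For illustration (arbitrary values): nproc=4 gives x = 0.875, and with
     chunk=4, tc=1000 the target is (2*4+1)*4/1000 = 0.036; the bisection
     yields cross = 25, since 0.875^24 ~= 0.0406 > 0.036 >= 0.875^25 ~= 0.0355.
     Chunk indexes below cross use the exponential (guided) formula; from
     index cross on, the fixed chunk is handed out dynamic-style. */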
565  /* assert sanity of computed crossover point */
566  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
567  __kmp_pow<UT>(x, cross) <= target);
568 
569  /* save the crossover point in thread private dispatch structure */
570  pr->u.p.parm2 = cross;
571 
572 // C75803
573 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
574 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
575 #else
576 #define GUIDED_ANALYTICAL_WORKAROUND (x)
577 #endif
578  /* dynamic-style scheduling offset */
579  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
580  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
581  cross * chunk;
582 #if KMP_USE_X87CONTROL
583  // restore FPCW
584  _control87(oldFpcw, _MCW_PC);
585 #endif
586  } // if
587  } else {
588  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
589  "kmp_sch_static_greedy\n",
590  gtid));
591  schedule = kmp_sch_static_greedy;
592  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
593  pr->u.p.parm1 = tc;
594  } // if
595  } // case
596  break;
597  case kmp_sch_static_greedy:
598  KD_TRACE(
599  100,
600  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
601  gtid));
602  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
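  // For illustration: tc=10, nproc=4 gives parm1 = ceil(10/4) = 3, i.e.
  // each thread claims a single block of at most 3 consecutive iterations.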
603  break;
604  case kmp_sch_static_chunked:
605  case kmp_sch_dynamic_chunked:
606  if (pr->u.p.parm1 <= 0) {
607  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
608  }
609  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
610  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
611  gtid));
612  break;
613  case kmp_sch_trapezoidal: {
614  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
615 
616  T parm1, parm2, parm3, parm4;
617  KD_TRACE(100,
618  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
619  gtid));
620 
621  parm1 = chunk;
622 
623  /* F : size of the first cycle */
624  parm2 = (tc / (2 * nproc));
625 
626  if (parm2 < 1) {
627  parm2 = 1;
628  }
629 
630  /* L : size of the last cycle. Make sure the last cycle is not larger
631  than the first cycle. */
632  if (parm1 < 1) {
633  parm1 = 1;
634  } else if (parm1 > parm2) {
635  parm1 = parm2;
636  }
637 
638  /* N : number of cycles */
639  parm3 = (parm2 + parm1);
640  parm3 = (2 * tc + parm3 - 1) / parm3;
641 
642  if (parm3 < 2) {
643  parm3 = 2;
644  }
645 
646  /* sigma : decreasing incr of the trapezoid */
647  parm4 = (parm3 - 1);
648  parm4 = (parm2 - parm1) / parm4;
649 
650  // pointless check, because parm4 >= 0 always
651  // if ( parm4 < 0 ) {
652  // parm4 = 0;
653  //}
654 
655  pr->u.p.parm1 = parm1;
656  pr->u.p.parm2 = parm2;
657  pr->u.p.parm3 = parm3;
658  pr->u.p.parm4 = parm4;
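  // For illustration (arbitrary values): tc=100, nproc=2, chunk=1 gives
  // parm2 (first chunk) = 25, parm1 (last chunk) = 1, parm3 (cycles) = 8,
  // parm4 (decrement) = 3, i.e. chunk sizes 25, 22, 19, 16, 13, 10, 7, 4
  // (sum 116 >= tc; the final chunk is clipped to the trip count when
  // dispatched).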
659  } // case
660  break;
661 
662  default: {
663  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
664  KMP_HNT(GetNewerLibrary), // Hint
665  __kmp_msg_null // Variadic argument list terminator
666  );
667  } break;
668  } // switch
669  pr->schedule = schedule;
670 }
671 
672 #if KMP_USE_HIER_SCHED
673 template <typename T>
674 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
675  typename traits_t<T>::signed_t st);
676 template <>
677 inline void
678 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
679  kmp_int32 ub, kmp_int32 st) {
680  __kmp_dispatch_init_hierarchy<kmp_int32>(
681  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
682  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
683 }
684 template <>
685 inline void
686 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
687  kmp_uint32 ub, kmp_int32 st) {
688  __kmp_dispatch_init_hierarchy<kmp_uint32>(
689  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
690  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
691 }
692 template <>
693 inline void
694 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
695  kmp_int64 ub, kmp_int64 st) {
696  __kmp_dispatch_init_hierarchy<kmp_int64>(
697  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
698  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
699 }
700 template <>
701 inline void
702 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
703  kmp_uint64 ub, kmp_int64 st) {
704  __kmp_dispatch_init_hierarchy<kmp_uint64>(
705  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
706  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
707 }
708 
709 // free all the hierarchy scheduling memory associated with the team
710 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
711  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
712  for (int i = 0; i < num_disp_buff; ++i) {
713  // type does not matter here so use kmp_int32
714  auto sh =
715  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
716  &team->t.t_disp_buffer[i]);
717  if (sh->hier) {
718  sh->hier->deallocate();
719  __kmp_free(sh->hier);
720  }
721  }
722 }
723 #endif
724 
725 // UT - unsigned flavor of T, ST - signed flavor of T,
726 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
727 template <typename T>
728 static void
729 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
730  T ub, typename traits_t<T>::signed_t st,
731  typename traits_t<T>::signed_t chunk, int push_ws) {
732  typedef typename traits_t<T>::unsigned_t UT;
733 
734  int active;
735  kmp_info_t *th;
736  kmp_team_t *team;
737  kmp_uint32 my_buffer_index;
738  dispatch_private_info_template<T> *pr;
739  dispatch_shared_info_template<T> volatile *sh;
740 
741  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
742  sizeof(dispatch_private_info));
743  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
744  sizeof(dispatch_shared_info));
745 
746  if (!TCR_4(__kmp_init_parallel))
747  __kmp_parallel_initialize();
748 
749 #if OMP_50_ENABLED
750  __kmp_resume_if_soft_paused();
751 #endif
752 
753 #if INCLUDE_SSC_MARKS
754  SSC_MARK_DISPATCH_INIT();
755 #endif
756 #ifdef KMP_DEBUG
757  typedef typename traits_t<T>::signed_t ST;
758  {
759  char *buff;
760  // create format specifiers before the debug output
761  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
762  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
763  traits_t<ST>::spec, traits_t<T>::spec,
764  traits_t<T>::spec, traits_t<ST>::spec);
765  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
766  __kmp_str_free(&buff);
767  }
768 #endif
769  /* setup data */
770  th = __kmp_threads[gtid];
771  team = th->th.th_team;
772  active = !team->t.t_serialized;
773  th->th.th_ident = loc;
774 
775  // Any half-decent optimizer will remove this test when the blocks are
776  // empty, since the macros expand to nothing when statistics are
777  // disabled.
778  if (schedule == __kmp_static) {
779  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
780  } else {
781  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
782  }
783 
784 #if KMP_USE_HIER_SCHED
785  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE environment variable
786  // Hierarchical scheduling does not work with ordered, so if ordered is
787  // detected, then revert back to threaded scheduling.
788  bool ordered;
789  enum sched_type my_sched = schedule;
790  my_buffer_index = th->th.th_dispatch->th_disp_index;
791  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
792  &th->th.th_dispatch
793  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
794  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
795  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
796  my_sched =
797  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
798  ordered = (kmp_ord_lower & my_sched);
799  if (pr->flags.use_hier) {
800  if (ordered) {
801  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
802  "Disabling hierarchical scheduling.\n",
803  gtid));
804  pr->flags.use_hier = FALSE;
805  }
806  }
807  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
808  // Don't use hierarchical for ordered parallel loops and don't
809  // use the runtime hierarchy if one was specified in the program
810  if (!ordered && !pr->flags.use_hier)
811  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
812  }
813 #endif // KMP_USE_HIER_SCHED
814 
815 #if USE_ITT_BUILD
816  kmp_uint64 cur_chunk = chunk;
817  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
818  __kmp_forkjoin_frames_mode == 3 &&
819  KMP_MASTER_GTID(gtid) &&
820 #if OMP_40_ENABLED
821  th->th.th_teams_microtask == NULL &&
822 #endif
823  team->t.t_active_level == 1;
824 #endif
825  if (!active) {
826  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
827  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
828  } else {
829  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
830  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
831 
832  my_buffer_index = th->th.th_dispatch->th_disp_index++;
833 
834  /* What happens when number of threads changes, need to resize buffer? */
835  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
836  &th->th.th_dispatch
837  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
838  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
839  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
840  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
841  my_buffer_index));
842  }
843 
844  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
845 #if USE_ITT_BUILD
846  &cur_chunk,
847 #endif
848  chunk, (T)th->th.th_team_nproc,
849  (T)th->th.th_info.ds.ds_tid);
850  if (active) {
851  if (pr->flags.ordered == 0) {
852  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
853  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
854  } else {
855  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
856  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
857  }
858  }
859 
860  if (active) {
861  /* This buffer is free for this thread to use once sh->buffer_index
862  * reaches my_buffer_index */
863 
864  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
865  "sh->buffer_index:%d\n",
866  gtid, my_buffer_index, sh->buffer_index));
867  __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
868  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
869  // Note: KMP_WAIT() cannot be used there: buffer index and
870  // my_buffer_index are *always* 32-bit integers.
871  KMP_MB(); /* is this necessary? */
872  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
873  "sh->buffer_index:%d\n",
874  gtid, my_buffer_index, sh->buffer_index));
875 
876  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
877  th->th.th_dispatch->th_dispatch_sh_current =
878  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
879 #if USE_ITT_BUILD
880  if (pr->flags.ordered) {
881  __kmp_itt_ordered_init(gtid);
882  }
883  // Report loop metadata
884  if (itt_need_metadata_reporting) {
885  // Only report metadata by master of active team at level 1
886  kmp_uint64 schedtype = 0;
887  switch (schedule) {
888  case kmp_sch_static_chunked:
889  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
890  break;
891  case kmp_sch_static_greedy:
892  cur_chunk = pr->u.p.parm1;
893  break;
894  case kmp_sch_dynamic_chunked:
895  schedtype = 1;
896  break;
897  case kmp_sch_guided_iterative_chunked:
898  case kmp_sch_guided_analytical_chunked:
899 #if OMP_45_ENABLED
900  case kmp_sch_guided_simd:
901 #endif
902  schedtype = 2;
903  break;
904  default:
905  // Should we put this case under "static"?
906  // case kmp_sch_static_steal:
907  schedtype = 3;
908  break;
909  }
910  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
911  }
912 #if KMP_USE_HIER_SCHED
913  if (pr->flags.use_hier) {
914  pr->u.p.count = 0;
915  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
916  }
917 #endif // KMP_USE_HIER_SCHED
918 #endif /* USE_ITT_BUILD */
919  }
920 
921 #ifdef KMP_DEBUG
922  {
923  char *buff;
924  // create format specifiers before the debug output
925  buff = __kmp_str_format(
926  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
927  "lb:%%%s ub:%%%s"
928  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
929  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
930  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
931  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
932  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
933  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
934  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
935  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
936  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
937  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
938  __kmp_str_free(&buff);
939  }
940 #endif
941 #if (KMP_STATIC_STEAL_ENABLED)
942 // After a loop with some other schedule kind has executed, there is no
943 // guarantee that all the parm3 variables contain the same value. Even if
944 // they did, a value such as 0 or 1 may repeat across loops, unlike a counter
945 // incremented over the lifetime of the program, so a dedicated variable is
946 // required: the 'static_steal_counter' is used.
947  if (schedule == kmp_sch_static_steal) {
948  // Other threads will inspect this variable when searching for a victim.
949  // This is a flag showing that other threads may steal from this thread
950  // since then.
951  volatile T *p = &pr->u.p.static_steal_counter;
952  *p = *p + 1;
953  }
954 #endif // ( KMP_STATIC_STEAL_ENABLED )
955 
956 #if OMPT_SUPPORT && OMPT_OPTIONAL
957  if (ompt_enabled.ompt_callback_work) {
958  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
959  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
960  ompt_callbacks.ompt_callback(ompt_callback_work)(
961  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
962  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
963  }
964 #endif
965  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
966 }
967 
968 /* For ordered loops, either __kmp_dispatch_finish() should be called after
969  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
970  * every chunk of iterations. If the ordered section(s) were not executed
971  * for this iteration (or every iteration in this chunk), we need to set the
972  * ordered iteration counters so that the next thread can proceed. */
973 template <typename UT>
974 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
975  typedef typename traits_t<UT>::signed_t ST;
976  kmp_info_t *th = __kmp_threads[gtid];
977 
978  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
979  if (!th->th.th_team->t.t_serialized) {
980 
981  dispatch_private_info_template<UT> *pr =
982  reinterpret_cast<dispatch_private_info_template<UT> *>(
983  th->th.th_dispatch->th_dispatch_pr_current);
984  dispatch_shared_info_template<UT> volatile *sh =
985  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
986  th->th.th_dispatch->th_dispatch_sh_current);
987  KMP_DEBUG_ASSERT(pr);
988  KMP_DEBUG_ASSERT(sh);
989  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
990  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
991 
992  if (pr->ordered_bumped) {
993  KD_TRACE(
994  1000,
995  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
996  gtid));
997  pr->ordered_bumped = 0;
998  } else {
999  UT lower = pr->u.p.ordered_lower;
1000 
1001 #ifdef KMP_DEBUG
1002  {
1003  char *buff;
1004  // create format specifiers before the debug output
1005  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1006  "ordered_iteration:%%%s lower:%%%s\n",
1007  traits_t<UT>::spec, traits_t<UT>::spec);
1008  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1009  __kmp_str_free(&buff);
1010  }
1011 #endif
1012 
1013  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1014  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1015  KMP_MB(); /* is this necessary? */
1016 #ifdef KMP_DEBUG
1017  {
1018  char *buff;
1019  // create format specifiers before the debug output
1020  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1021  "ordered_iteration:%%%s lower:%%%s\n",
1022  traits_t<UT>::spec, traits_t<UT>::spec);
1023  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1024  __kmp_str_free(&buff);
1025  }
1026 #endif
1027 
1028  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1029  } // if
1030  } // if
1031  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1032 }
1033 
1034 #ifdef KMP_GOMP_COMPAT
1035 
1036 template <typename UT>
1037 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1038  typedef typename traits_t<UT>::signed_t ST;
1039  kmp_info_t *th = __kmp_threads[gtid];
1040 
1041  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1042  if (!th->th.th_team->t.t_serialized) {
1043  // int cid;
1044  dispatch_private_info_template<UT> *pr =
1045  reinterpret_cast<dispatch_private_info_template<UT> *>(
1046  th->th.th_dispatch->th_dispatch_pr_current);
1047  dispatch_shared_info_template<UT> volatile *sh =
1048  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1049  th->th.th_dispatch->th_dispatch_sh_current);
1050  KMP_DEBUG_ASSERT(pr);
1051  KMP_DEBUG_ASSERT(sh);
1052  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1053  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1054 
1055  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1056  UT lower = pr->u.p.ordered_lower;
1057  UT upper = pr->u.p.ordered_upper;
1058  UT inc = upper - lower + 1;
1059 
1060  if (pr->ordered_bumped == inc) {
1061  KD_TRACE(
1062  1000,
1063  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1064  gtid));
1065  pr->ordered_bumped = 0;
1066  } else {
1067  inc -= pr->ordered_bumped;
1068 
1069 #ifdef KMP_DEBUG
1070  {
1071  char *buff;
1072  // create format specifiers before the debug output
1073  buff = __kmp_str_format(
1074  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1075  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1076  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1077  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1078  __kmp_str_free(&buff);
1079  }
1080 #endif
1081 
1082  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1083  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1084 
1085  KMP_MB(); /* is this necessary? */
1086  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1087  "ordered_bumped to zero\n",
1088  gtid));
1089  pr->ordered_bumped = 0;
1091 #ifdef KMP_DEBUG
1092  {
1093  char *buff;
1094  // create format specifiers before the debug output
1095  buff = __kmp_str_format(
1096  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1097  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1098  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1099  traits_t<UT>::spec);
1100  KD_TRACE(1000,
1101  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1102  __kmp_str_free(&buff);
1103  }
1104 #endif
1105 
1106  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1107  }
1108  // }
1109  }
1110  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1111 }
1112 
1113 #endif /* KMP_GOMP_COMPAT */
1114 
1115 template <typename T>
1116 int __kmp_dispatch_next_algorithm(int gtid,
1117  dispatch_private_info_template<T> *pr,
1118  dispatch_shared_info_template<T> volatile *sh,
1119  kmp_int32 *p_last, T *p_lb, T *p_ub,
1120  typename traits_t<T>::signed_t *p_st, T nproc,
1121  T tid) {
1122  typedef typename traits_t<T>::unsigned_t UT;
1123  typedef typename traits_t<T>::signed_t ST;
1124  typedef typename traits_t<T>::floating_t DBL;
1125  int status = 0;
1126  kmp_int32 last = 0;
1127  T start;
1128  ST incr;
1129  UT limit, trip, init;
1130  kmp_info_t *th = __kmp_threads[gtid];
1131  kmp_team_t *team = th->th.th_team;
1132 
1133  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1134  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1135  KMP_DEBUG_ASSERT(pr);
1136  KMP_DEBUG_ASSERT(sh);
1137  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1138 #ifdef KMP_DEBUG
1139  {
1140  char *buff;
1141  // create format specifiers before the debug output
1142  buff =
1143  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1144  "sh:%%p nproc:%%%s tid:%%%s\n",
1145  traits_t<T>::spec, traits_t<T>::spec);
1146  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1147  __kmp_str_free(&buff);
1148  }
1149 #endif
1150 
1151  // zero trip count
1152  if (pr->u.p.tc == 0) {
1153  KD_TRACE(10,
1154  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1155  "zero status:%d\n",
1156  gtid, status));
1157  return 0;
1158  }
1159 
1160  switch (pr->schedule) {
1161 #if (KMP_STATIC_STEAL_ENABLED)
1162  case kmp_sch_static_steal: {
1163  T chunk = pr->u.p.parm1;
1164 
1165  KD_TRACE(100,
1166  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1167  gtid));
1168 
1169  trip = pr->u.p.tc - 1;
1170 
1171  if (traits_t<T>::type_size > 4) {
1172  // use lock for 8-byte and CAS for 4-byte induction
1173  // variable. TODO (optional): check and use 16-byte CAS
1174  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1175  KMP_DEBUG_ASSERT(lck != NULL);
1176  if (pr->u.p.count < (UT)pr->u.p.ub) {
1177  __kmp_acquire_lock(lck, gtid);
1178  // try to get own chunk of iterations
1179  init = (pr->u.p.count)++;
1180  status = (init < (UT)pr->u.p.ub);
1181  __kmp_release_lock(lck, gtid);
1182  } else {
1183  status = 0; // no own chunks
1184  }
1185  if (!status) { // try to steal
1186  kmp_info_t **other_threads = team->t.t_threads;
1187  int while_limit = nproc; // nproc attempts to find a victim
1188  int while_index = 0;
1189  // TODO: algorithm of searching for a victim
1190  // should be cleaned up and measured
1191  while ((!status) && (while_limit != ++while_index)) {
1192  T remaining;
1193  T victimIdx = pr->u.p.parm4;
1194  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1195  dispatch_private_info_template<T> *victim =
1196  reinterpret_cast<dispatch_private_info_template<T> *>(
1197  other_threads[victimIdx]
1198  ->th.th_dispatch->th_dispatch_pr_current);
1199  while ((victim == NULL || victim == pr ||
1200  (*(volatile T *)&victim->u.p.static_steal_counter !=
1201  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1202  oldVictimIdx != victimIdx) {
1203  victimIdx = (victimIdx + 1) % nproc;
1204  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1205  other_threads[victimIdx]
1206  ->th.th_dispatch->th_dispatch_pr_current);
1207  }
1208  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1209  *(volatile T *)&pr->u.p.static_steal_counter)) {
1210  continue; // try once more (nproc attempts in total)
1211  // no victim is ready yet to participate in stealing
1212  // because all victims are still in kmp_init_dispatch
1213  }
1214  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1215  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1216  continue; // not enough chunks to steal, goto next victim
1217  }
1218 
1219  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1220  KMP_ASSERT(lck != NULL);
1221  __kmp_acquire_lock(lck, gtid);
1222  limit = victim->u.p.ub; // keep initial ub
1223  if (victim->u.p.count >= limit ||
1224  (remaining = limit - victim->u.p.count) < 2) {
1225  __kmp_release_lock(lck, gtid);
1226  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1227  continue; // not enough chunks to steal
1228  }
1229  // stealing succeeded: reduce victim's ub by 1/4 of the remaining chunks,
1230  // or by 1
1231  if (remaining > 3) {
1232  // steal 1/4 of remaining
1233  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1234  init = (victim->u.p.ub -= (remaining >> 2));
1235  } else {
1236  // steal 1 chunk of 2 or 3 remaining
1237  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1238  init = (victim->u.p.ub -= 1);
1239  }
1240  __kmp_release_lock(lck, gtid);
1241 
1242  KMP_DEBUG_ASSERT(init + 1 <= limit);
1243  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1244  status = 1;
1245  while_index = 0;
1246  // now update own count and ub with the stolen range (the chunk at init is consumed now)
1247  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1248  pr->u.p.count = init + 1;
1249  pr->u.p.ub = limit;
1250  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1251  } // while (search for victim)
1252  } // if (try to find victim and steal)
1253  } else {
1254  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1255  typedef union {
1256  struct {
1257  UT count;
1258  T ub;
1259  } p;
1260  kmp_int64 b;
1261  } union_i4;
1262  // All operations on 'count' or 'ub' must be combined atomically
1263  // together.
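  // For illustration: packing count and ub into one 64-bit word lets a
  // single CAS update the pair atomically. E.g. if the owner reads
  // {count=3, ub=10} and tries to publish {count=4, ub=10} while a thief
  // concurrently tries {count=3, ub=8} on the same word, only one CAS
  // succeeds; the other side rereads and retries, so the pair is never
  // updated from a stale snapshot.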
1264  {
1265  union_i4 vold, vnew;
1266  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1267  vnew = vold;
1268  vnew.p.count++;
1269  while (!KMP_COMPARE_AND_STORE_ACQ64(
1270  (volatile kmp_int64 *)&pr->u.p.count,
1271  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1272  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1273  KMP_CPU_PAUSE();
1274  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1275  vnew = vold;
1276  vnew.p.count++;
1277  }
1278  vnew = vold;
1279  init = vnew.p.count;
1280  status = (init < (UT)vnew.p.ub);
1281  }
1282 
1283  if (!status) {
1284  kmp_info_t **other_threads = team->t.t_threads;
1285  int while_limit = nproc; // nproc attempts to find a victim
1286  int while_index = 0;
1287 
1288  // TODO: algorithm of searching for a victim
1289  // should be cleaned up and measured
1290  while ((!status) && (while_limit != ++while_index)) {
1291  union_i4 vold, vnew;
1292  kmp_int32 remaining;
1293  T victimIdx = pr->u.p.parm4;
1294  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1295  dispatch_private_info_template<T> *victim =
1296  reinterpret_cast<dispatch_private_info_template<T> *>(
1297  other_threads[victimIdx]
1298  ->th.th_dispatch->th_dispatch_pr_current);
1299  while ((victim == NULL || victim == pr ||
1300  (*(volatile T *)&victim->u.p.static_steal_counter !=
1301  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1302  oldVictimIdx != victimIdx) {
1303  victimIdx = (victimIdx + 1) % nproc;
1304  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1305  other_threads[victimIdx]
1306  ->th.th_dispatch->th_dispatch_pr_current);
1307  }
1308  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1309  *(volatile T *)&pr->u.p.static_steal_counter)) {
1310  continue; // try once more (nproc attempts in total)
1311  // no victim is ready yet to participate in stealing
1312  // because all victims are still in kmp_init_dispatch
1313  }
1314  pr->u.p.parm4 = victimIdx; // new victim found
1315  while (1) { // CAS loop if victim has enough chunks to steal
1316  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1317  vnew = vold;
1318 
1319  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1320  if (vnew.p.count >= (UT)vnew.p.ub ||
1321  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1322  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1323  break; // not enough chunks to steal, goto next victim
1324  }
1325  if (remaining > 3) {
1326  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1327  } else {
1328  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1329  }
1330  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1331  // TODO: Should this be acquire or release?
1332  if (KMP_COMPARE_AND_STORE_ACQ64(
1333  (volatile kmp_int64 *)&victim->u.p.count,
1334  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1335  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1336  // stealing succeeded
1337  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1338  vold.p.ub - vnew.p.ub);
1339  status = 1;
1340  while_index = 0;
1341  // now update own count and ub
1342  init = vnew.p.ub;
1343  vold.p.count = init + 1;
1344 #if KMP_ARCH_X86
1345  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1346 #else
1347  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1348 #endif
1349  break;
1350  } // if (check CAS result)
1351  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1352  } // while (try to steal from particular victim)
1353  } // while (search for victim)
1354  } // if (try to find victim and steal)
1355  } // if (4-byte induction variable)
1356  if (!status) {
1357  *p_lb = 0;
1358  *p_ub = 0;
1359  if (p_st != NULL)
1360  *p_st = 0;
1361  } else {
1362  start = pr->u.p.parm2;
1363  init *= chunk;
1364  limit = chunk + init - 1;
1365  incr = pr->u.p.st;
1366  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1367 
1368  KMP_DEBUG_ASSERT(init <= trip);
1369  if ((last = (limit >= trip)) != 0)
1370  limit = trip;
1371  if (p_st != NULL)
1372  *p_st = incr;
1373 
1374  if (incr == 1) {
1375  *p_lb = start + init;
1376  *p_ub = start + limit;
1377  } else {
1378  *p_lb = start + init * incr;
1379  *p_ub = start + limit * incr;
1380  }
1381 
1382  if (pr->flags.ordered) {
1383  pr->u.p.ordered_lower = init;
1384  pr->u.p.ordered_upper = limit;
1385  } // if
1386  } // if
1387  break;
1388  } // case
1389 #endif // ( KMP_STATIC_STEAL_ENABLED )
1390  case kmp_sch_static_balanced: {
1391  KD_TRACE(
1392  10,
1393  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1394  gtid));
1395  /* check if thread has any iteration to do */
1396  if ((status = !pr->u.p.count) != 0) {
1397  pr->u.p.count = 1;
1398  *p_lb = pr->u.p.lb;
1399  *p_ub = pr->u.p.ub;
1400  last = pr->u.p.parm1;
1401  if (p_st != NULL)
1402  *p_st = pr->u.p.st;
1403  } else { /* no iterations to do */
1404  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1405  }
1406  } // case
1407  break;
1408  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1409  merged here */
1410  case kmp_sch_static_chunked: {
1411  T parm1;
1412 
1413  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1414  "kmp_sch_static_[affinity|chunked] case\n",
1415  gtid));
1416  parm1 = pr->u.p.parm1;
1417 
1418  trip = pr->u.p.tc - 1;
1419  init = parm1 * (pr->u.p.count + tid);
1420 
1421  if ((status = (init <= trip)) != 0) {
1422  start = pr->u.p.lb;
1423  incr = pr->u.p.st;
1424  limit = parm1 + init - 1;
1425 
1426  if ((last = (limit >= trip)) != 0)
1427  limit = trip;
1428 
1429  if (p_st != NULL)
1430  *p_st = incr;
1431 
1432  pr->u.p.count += nproc;
1433 
1434  if (incr == 1) {
1435  *p_lb = start + init;
1436  *p_ub = start + limit;
1437  } else {
1438  *p_lb = start + init * incr;
1439  *p_ub = start + limit * incr;
1440  }
1441 
1442  if (pr->flags.ordered) {
1443  pr->u.p.ordered_lower = init;
1444  pr->u.p.ordered_upper = limit;
1445  } // if
1446  } // if
1447  } // case
1448  break;
1449 
1450  case kmp_sch_dynamic_chunked: {
1451  T chunk = pr->u.p.parm1;
1452 
1453  KD_TRACE(
1454  100,
1455  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1456  gtid));
1457 
1458  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1459  trip = pr->u.p.tc - 1;
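  // For illustration: with chunk=4 and tc=10 (trip=9), successive values of
  // the shared counter hand out [0,3], [4,7], then [8,9] (clipped to trip
  // below); a counter value of 3 gives init=12 > trip, i.e. no more work.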
1460 
1461  if ((status = (init <= trip)) == 0) {
1462  *p_lb = 0;
1463  *p_ub = 0;
1464  if (p_st != NULL)
1465  *p_st = 0;
1466  } else {
1467  start = pr->u.p.lb;
1468  limit = chunk + init - 1;
1469  incr = pr->u.p.st;
1470 
1471  if ((last = (limit >= trip)) != 0)
1472  limit = trip;
1473 
1474  if (p_st != NULL)
1475  *p_st = incr;
1476 
1477  if (incr == 1) {
1478  *p_lb = start + init;
1479  *p_ub = start + limit;
1480  } else {
1481  *p_lb = start + init * incr;
1482  *p_ub = start + limit * incr;
1483  }
1484 
1485  if (pr->flags.ordered) {
1486  pr->u.p.ordered_lower = init;
1487  pr->u.p.ordered_upper = limit;
1488  } // if
1489  } // if
1490  } // case
1491  break;
1492 
1493  case kmp_sch_guided_iterative_chunked: {
1494  T chunkspec = pr->u.p.parm1;
1495  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1496  "iterative case\n",
1497  gtid));
1498  trip = pr->u.p.tc;
1499  // Start atomic part of calculations
1500  while (1) {
1501  ST remaining; // signed, because can be < 0
1502  init = sh->u.s.iteration; // shared value
1503  remaining = trip - init;
1504  if (remaining <= 0) { // AC: need to compare with 0 first
1505  // nothing to do, don't try atomic op
1506  status = 0;
1507  break;
1508  }
1509  if ((T)remaining <
1510  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1511  // use dynamic-style schedule
1512  // atomically increment iterations, get old value
1513  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1514  (ST)chunkspec);
1515  remaining = trip - init;
1516  if (remaining <= 0) {
1517  status = 0; // all iterations got by other threads
1518  } else {
1519  // got some iterations to work on
1520  status = 1;
1521  if ((T)remaining > chunkspec) {
1522  limit = init + chunkspec - 1;
1523  } else {
1524  last = 1; // the last chunk
1525  limit = init + remaining - 1;
1526  } // if
1527  } // if
1528  break;
1529  } // if
1530  limit = init +
1531  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1532  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1533  (ST)init, (ST)limit)) {
1534  // CAS was successful, chunk obtained
1535  status = 1;
1536  --limit;
1537  break;
1538  } // if
1539  } // while
1540  if (status != 0) {
1541  start = pr->u.p.lb;
1542  incr = pr->u.p.st;
1543  if (p_st != NULL)
1544  *p_st = incr;
1545  *p_lb = start + init * incr;
1546  *p_ub = start + limit * incr;
1547  if (pr->flags.ordered) {
1548  pr->u.p.ordered_lower = init;
1549  pr->u.p.ordered_upper = limit;
1550  } // if
1551  } else {
1552  *p_lb = 0;
1553  *p_ub = 0;
1554  if (p_st != NULL)
1555  *p_st = 0;
1556  } // if
1557  } // case
1558  break;
1559 
1560 #if OMP_45_ENABLED
1561  case kmp_sch_guided_simd: {
1562  // same as iterative but curr-chunk adjusted to be multiple of given
1563  // chunk
1564  T chunk = pr->u.p.parm1;
1565  KD_TRACE(100,
1566  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1567  gtid));
1568  trip = pr->u.p.tc;
1569  // Start atomic part of calculations
1570  while (1) {
1571  ST remaining; // signed, because can be < 0
1572  init = sh->u.s.iteration; // shared value
1573  remaining = trip - init;
1574  if (remaining <= 0) { // AC: need to compare with 0 first
1575  status = 0; // nothing to do, don't try atomic op
1576  break;
1577  }
1578  KMP_DEBUG_ASSERT(init % chunk == 0);
1579  // compare with K*nproc*(chunk+1), K=2 by default
1580  if ((T)remaining < pr->u.p.parm2) {
1581  // use dynamic-style schedule
1582  // atomically increment iterations, get old value
1583  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1584  (ST)chunk);
1585  remaining = trip - init;
1586  if (remaining <= 0) {
1587  status = 0; // all iterations got by other threads
1588  } else {
1589  // got some iterations to work on
1590  status = 1;
1591  if ((T)remaining > chunk) {
1592  limit = init + chunk - 1;
1593  } else {
1594  last = 1; // the last chunk
1595  limit = init + remaining - 1;
1596  } // if
1597  } // if
1598  break;
1599  } // if
1600  // divide by K*nproc
1601  UT span = remaining * (*(double *)&pr->u.p.parm3);
1602  UT rem = span % chunk;
1603  if (rem) // adjust so that span%chunk == 0
1604  span += chunk - rem;
1605  limit = init + span;
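  // For illustration: with nproc=4 (so parm3 holds 1/(2*nproc) = 0.125),
  // remaining=1000 and chunk=8: span = 125, rounded up to 128 so the
  // grabbed range stays a multiple of the simd chunk; limit = init + 128.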
1606  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1607  (ST)init, (ST)limit)) {
1608  // CAS was successful, chunk obtained
1609  status = 1;
1610  --limit;
1611  break;
1612  } // if
1613  } // while
1614  if (status != 0) {
1615  start = pr->u.p.lb;
1616  incr = pr->u.p.st;
1617  if (p_st != NULL)
1618  *p_st = incr;
1619  *p_lb = start + init * incr;
1620  *p_ub = start + limit * incr;
1621  if (pr->flags.ordered) {
1622  pr->u.p.ordered_lower = init;
1623  pr->u.p.ordered_upper = limit;
1624  } // if
1625  } else {
1626  *p_lb = 0;
1627  *p_ub = 0;
1628  if (p_st != NULL)
1629  *p_st = 0;
1630  } // if
1631  } // case
1632  break;
1633 #endif // OMP_45_ENABLED
1634 
1635  case kmp_sch_guided_analytical_chunked: {
1636  T chunkspec = pr->u.p.parm1;
1637  UT chunkIdx;
1638 #if KMP_USE_X87CONTROL
1639  /* for storing original FPCW value for Windows* OS on
1640  IA-32 architecture 8-byte version */
1641  unsigned int oldFpcw;
1642  unsigned int fpcwSet = 0;
1643 #endif
1644  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1645  "kmp_sch_guided_analytical_chunked case\n",
1646  gtid));
1647 
1648  trip = pr->u.p.tc;
1649 
1650  KMP_DEBUG_ASSERT(nproc > 1);
1651  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1652 
1653  while (1) { /* this while loop is a safeguard against unexpected zero
1654  chunk sizes */
1655  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1656  if (chunkIdx >= (UT)pr->u.p.parm2) {
1657  --trip;
1658  /* use dynamic-style scheduling */
1659  init = chunkIdx * chunkspec + pr->u.p.count;
1660  /* need to verify init > 0 in case of overflow in the above
1661  * calculation */
1662  if ((status = (init > 0 && init <= trip)) != 0) {
1663  limit = init + chunkspec - 1;
1664 
1665  if ((last = (limit >= trip)) != 0)
1666  limit = trip;
1667  }
1668  break;
1669  } else {
1670 /* use exponential-style scheduling */
1671 /* The following check works around the lack of long double precision on
1672  Windows* OS, which can otherwise have the effect that init != 0 for
1673  chunkIdx == 0.
1674  */
1675 #if KMP_USE_X87CONTROL
1676  /* If we haven't already done so, save original
1677  FPCW and set precision to 64-bit, as Windows* OS
1678  on IA-32 architecture defaults to 53-bit */
1679  if (!fpcwSet) {
1680  oldFpcw = _control87(0, 0);
1681  _control87(_PC_64, _MCW_PC);
1682  fpcwSet = 0x30000;
1683  }
1684 #endif
1685  if (chunkIdx) {
1686  init = __kmp_dispatch_guided_remaining<T>(
1687  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1688  KMP_DEBUG_ASSERT(init);
1689  init = trip - init;
1690  } else
1691  init = 0;
1692  limit = trip - __kmp_dispatch_guided_remaining<T>(
1693  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1694  KMP_ASSERT(init <= limit);
1695  if (init < limit) {
1696  KMP_DEBUG_ASSERT(limit <= trip);
1697  --limit;
1698  status = 1;
1699  break;
1700  } // if
1701  } // if
1702  } // while (1)
1703 #if KMP_USE_X87CONTROL
1704  /* restore FPCW if necessary
1705  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1706  */
1707  if (fpcwSet && (oldFpcw & fpcwSet))
1708  _control87(oldFpcw, _MCW_PC);
1709 #endif
1710  if (status != 0) {
1711  start = pr->u.p.lb;
1712  incr = pr->u.p.st;
1713  if (p_st != NULL)
1714  *p_st = incr;
1715  *p_lb = start + init * incr;
1716  *p_ub = start + limit * incr;
1717  if (pr->flags.ordered) {
1718  pr->u.p.ordered_lower = init;
1719  pr->u.p.ordered_upper = limit;
1720  }
1721  } else {
1722  *p_lb = 0;
1723  *p_ub = 0;
1724  if (p_st != NULL)
1725  *p_st = 0;
1726  }
1727  } // case
1728  break;
1729 
1730  case kmp_sch_trapezoidal: {
1731  UT index;
1732  T parm2 = pr->u.p.parm2;
1733  T parm3 = pr->u.p.parm3;
1734  T parm4 = pr->u.p.parm4;
1735  KD_TRACE(100,
1736  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1737  gtid));
1738 
1739  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1740 
1741  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
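 // Chunk sizes shrink linearly (chunk k has size parm2 - k * parm4), so init
 // is the arithmetic-series sum of the first `index` chunk sizes,
 // index * parm2 - parm4 * index * (index - 1) / 2, folded into one
 // expression; `limit` below is computed the same way for index + 1 chunks.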
1742  trip = pr->u.p.tc - 1;
1743 
1744  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1745  *p_lb = 0;
1746  *p_ub = 0;
1747  if (p_st != NULL)
1748  *p_st = 0;
1749  } else {
1750  start = pr->u.p.lb;
1751  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1752  incr = pr->u.p.st;
1753 
1754  if ((last = (limit >= trip)) != 0)
1755  limit = trip;
1756 
1757  if (p_st != NULL)
1758  *p_st = incr;
1759 
1760  if (incr == 1) {
1761  *p_lb = start + init;
1762  *p_ub = start + limit;
1763  } else {
1764  *p_lb = start + init * incr;
1765  *p_ub = start + limit * incr;
1766  }
1767 
1768  if (pr->flags.ordered) {
1769  pr->u.p.ordered_lower = init;
1770  pr->u.p.ordered_upper = limit;
1771  } // if
1772  } // if
1773  } // case
1774  break;
1775  default: {
1776  status = 0; // to avoid complaints on uninitialized variable use
1777  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1778  KMP_HNT(GetNewerLibrary), // Hint
1779  __kmp_msg_null // Variadic argument list terminator
1780  );
1781  } break;
1782  } // switch
1783  if (p_last)
1784  *p_last = last;
1785 #ifdef KMP_DEBUG
1786  if (pr->flags.ordered) {
1787  char *buff;
1788  // create format specifiers before the debug output
1789  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1790  "ordered_lower:%%%s ordered_upper:%%%s\n",
1791  traits_t<UT>::spec, traits_t<UT>::spec);
1792  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1793  __kmp_str_free(&buff);
1794  }
1795  {
1796  char *buff;
1797  // create format specifiers before the debug output
1798  buff = __kmp_str_format(
1799  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1800  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1801  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1802  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1803  __kmp_str_free(&buff);
1804  }
1805 #endif
1806  return status;
1807 }
1808 
1809 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1810  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1811  is not called. */
1812 #if OMPT_SUPPORT && OMPT_OPTIONAL
1813 #define OMPT_LOOP_END \
1814  if (status == 0) { \
1815  if (ompt_enabled.ompt_callback_work) { \
1816  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1817  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1818  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1819  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1820  &(task_info->task_data), 0, codeptr); \
1821  } \
1822  }
1823 // TODO: implement count
1824 #else
1825 #define OMPT_LOOP_END // no-op
1826 #endif
1827 
1828 #if KMP_STATS_ENABLED
1829 #define KMP_STATS_LOOP_END \
1830  { \
1831  kmp_int64 u, l, t, i; \
1832  l = (kmp_int64)(*p_lb); \
1833  u = (kmp_int64)(*p_ub); \
1834  i = (kmp_int64)(pr->u.p.st); \
1835  if (status == 0) { \
1836  t = 0; \
1837  KMP_POP_PARTITIONED_TIMER(); \
1838  } else if (i == 1) { \
1839  if (u >= l) \
1840  t = u - l + 1; \
1841  else \
1842  t = 0; \
1843  } else if (i < 0) { \
1844  if (l >= u) \
1845  t = (l - u) / (-i) + 1; \
1846  else \
1847  t = 0; \
1848  } else { \
1849  if (u >= l) \
1850  t = (u - l) / i + 1; \
1851  else \
1852  t = 0; \
1853  } \
1854  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1855  }
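// Example: for *p_lb == 0, *p_ub == 9 and stride 2 the macro above records
// t = (9 - 0) / 2 + 1 = 5 iterations.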
1856 #else
1857 #define KMP_STATS_LOOP_END /* Nothing */
1858 #endif
1859 
1860 template <typename T>
1861 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1862  T *p_lb, T *p_ub,
1863  typename traits_t<T>::signed_t *p_st
1864 #if OMPT_SUPPORT && OMPT_OPTIONAL
1865  ,
1866  void *codeptr
1867 #endif
1868  ) {
1869 
1870  typedef typename traits_t<T>::unsigned_t UT;
1871  typedef typename traits_t<T>::signed_t ST;
1872  // This is potentially slightly misleading: schedule(runtime) will appear
1873  // here even if the actual runtime schedule is static. (Which points out a
1874  // disadvantage of schedule(runtime): even when static scheduling is used,
1875  // it costs more than a compile-time choice of static scheduling would.)
1876  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1877 
1878  int status;
1879  dispatch_private_info_template<T> *pr;
1880  kmp_info_t *th = __kmp_threads[gtid];
1881  kmp_team_t *team = th->th.th_team;
1882 
1883  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1884  KD_TRACE(
1885  1000,
1886  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1887  gtid, p_lb, p_ub, p_st, p_last));
1888 
1889  if (team->t.t_serialized) {
1890  /* NOTE: serialize this dispatch because we are not at the active level */
1891  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1892  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1893  KMP_DEBUG_ASSERT(pr);
1894 
1895  if ((status = (pr->u.p.tc != 0)) == 0) {
1896  *p_lb = 0;
1897  *p_ub = 0;
1898  // if ( p_last != NULL )
1899  // *p_last = 0;
1900  if (p_st != NULL)
1901  *p_st = 0;
1902  if (__kmp_env_consistency_check) {
1903  if (pr->pushed_ws != ct_none) {
1904  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1905  }
1906  }
1907  } else if (pr->flags.nomerge) {
1908  kmp_int32 last;
1909  T start;
1910  UT limit, trip, init;
1911  ST incr;
1912  T chunk = pr->u.p.parm1;
1913 
1914  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1915  gtid));
1916 
1917  init = chunk * pr->u.p.count++;
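 // Each successful call hands out the next chunk-sized block of the
 // serialized loop; u.p.count records how many chunks were taken so far.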
1918  trip = pr->u.p.tc - 1;
1919 
1920  if ((status = (init <= trip)) == 0) {
1921  *p_lb = 0;
1922  *p_ub = 0;
1923  // if ( p_last != NULL )
1924  // *p_last = 0;
1925  if (p_st != NULL)
1926  *p_st = 0;
1927  if (__kmp_env_consistency_check) {
1928  if (pr->pushed_ws != ct_none) {
1929  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1930  }
1931  }
1932  } else {
1933  start = pr->u.p.lb;
1934  limit = chunk + init - 1;
1935  incr = pr->u.p.st;
1936 
1937  if ((last = (limit >= trip)) != 0) {
1938  limit = trip;
1939 #if KMP_OS_WINDOWS
1940  pr->u.p.last_upper = pr->u.p.ub;
1941 #endif /* KMP_OS_WINDOWS */
1942  }
1943  if (p_last != NULL)
1944  *p_last = last;
1945  if (p_st != NULL)
1946  *p_st = incr;
1947  if (incr == 1) {
1948  *p_lb = start + init;
1949  *p_ub = start + limit;
1950  } else {
1951  *p_lb = start + init * incr;
1952  *p_ub = start + limit * incr;
1953  }
1954 
1955  if (pr->flags.ordered) {
1956  pr->u.p.ordered_lower = init;
1957  pr->u.p.ordered_upper = limit;
1958 #ifdef KMP_DEBUG
1959  {
1960  char *buff;
1961  // create format specifiers before the debug output
1962  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1963  "ordered_lower:%%%s ordered_upper:%%%s\n",
1964  traits_t<UT>::spec, traits_t<UT>::spec);
1965  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1966  pr->u.p.ordered_upper));
1967  __kmp_str_free(&buff);
1968  }
1969 #endif
1970  } // if
1971  } // if
1972  } else {
1973  pr->u.p.tc = 0;
1974  *p_lb = pr->u.p.lb;
1975  *p_ub = pr->u.p.ub;
1976 #if KMP_OS_WINDOWS
1977  pr->u.p.last_upper = *p_ub;
1978 #endif /* KMP_OS_WINDOWS */
1979  if (p_last != NULL)
1980  *p_last = TRUE;
1981  if (p_st != NULL)
1982  *p_st = pr->u.p.st;
1983  } // if
1984 #ifdef KMP_DEBUG
1985  {
1986  char *buff;
1987  // create format specifiers before the debug output
1988  buff = __kmp_str_format(
1989  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1990  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1991  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1992  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, (p_last ? *p_last : 0), status));
1993  __kmp_str_free(&buff);
1994  }
1995 #endif
1996 #if INCLUDE_SSC_MARKS
1997  SSC_MARK_DISPATCH_NEXT();
1998 #endif
1999  OMPT_LOOP_END;
2000  KMP_STATS_LOOP_END;
2001  return status;
2002  } else {
2003  kmp_int32 last = 0;
2004  dispatch_shared_info_template<T> volatile *sh;
2005 
2006  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2007  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2008 
2009  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2010  th->th.th_dispatch->th_dispatch_pr_current);
2011  KMP_DEBUG_ASSERT(pr);
2012  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2013  th->th.th_dispatch->th_dispatch_sh_current);
2014  KMP_DEBUG_ASSERT(sh);
2015 
2016 #if KMP_USE_HIER_SCHED
2017  if (pr->flags.use_hier)
2018  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2019  else
2020 #endif // KMP_USE_HIER_SCHED
2021  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2022  p_st, th->th.th_team_nproc,
2023  th->th.th_info.ds.ds_tid);
2024  // status == 0: no more iterations to execute
2025  if (status == 0) {
2026  UT num_done;
2027 
2028  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
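 // num_done counts threads that have exhausted this loop; the last one to
 // arrive (num_done == nproc - 1 after the increment) resets the shared
 // dispatch buffer below so it can be reused by a later loop.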
2029 #ifdef KMP_DEBUG
2030  {
2031  char *buff;
2032  // create format specifiers before the debug output
2033  buff = __kmp_str_format(
2034  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2035  traits_t<UT>::spec);
2036  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2037  __kmp_str_free(&buff);
2038  }
2039 #endif
2040 
2041 #if KMP_USE_HIER_SCHED
2042  pr->flags.use_hier = FALSE;
2043 #endif
2044  if ((ST)num_done == th->th.th_team_nproc - 1) {
2045 #if (KMP_STATIC_STEAL_ENABLED)
2046  if (pr->schedule == kmp_sch_static_steal &&
2047  traits_t<T>::type_size > 4) {
2048  int i;
2049  kmp_info_t **other_threads = team->t.t_threads;
2050  // loop complete, safe to destroy locks used for stealing
2051  for (i = 0; i < th->th.th_team_nproc; ++i) {
2052  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2053  KMP_ASSERT(lck != NULL);
2054  __kmp_destroy_lock(lck);
2055  __kmp_free(lck);
2056  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2057  }
2058  }
2059 #endif
2060  /* NOTE: release this buffer to be reused */
2061 
2062  KMP_MB(); /* Flush all pending memory write invalidates. */
2063 
2064  sh->u.s.num_done = 0;
2065  sh->u.s.iteration = 0;
2066 
2067  /* TODO replace with general release procedure? */
2068  if (pr->flags.ordered) {
2069  sh->u.s.ordered_iteration = 0;
2070  }
2071 
2072  KMP_MB(); /* Flush all pending memory write invalidates. */
2073 
2074  sh->buffer_index += __kmp_dispatch_num_buffers;
2075  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2076  gtid, sh->buffer_index));
2077 
2078  KMP_MB(); /* Flush all pending memory write invalidates. */
2079 
2080  } // if
2081  if (__kmp_env_consistency_check) {
2082  if (pr->pushed_ws != ct_none) {
2083  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2084  }
2085  }
2086 
2087  th->th.th_dispatch->th_deo_fcn = NULL;
2088  th->th.th_dispatch->th_dxo_fcn = NULL;
2089  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2090  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2091  } // if (status == 0)
2092 #if KMP_OS_WINDOWS
2093  else if (last) {
2094  pr->u.p.last_upper = pr->u.p.ub;
2095  }
2096 #endif /* KMP_OS_WINDOWS */
2097  if (p_last != NULL && status != 0)
2098  *p_last = last;
2099  } // if
2100 
2101 #ifdef KMP_DEBUG
2102  {
2103  char *buff;
2104  // create format specifiers before the debug output
2105  buff = __kmp_str_format(
2106  "__kmp_dispatch_next: T#%%d normal case: "
2107  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2108  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2109  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2110  (p_last ? *p_last : 0), status));
2111  __kmp_str_free(&buff);
2112  }
2113 #endif
2114 #if INCLUDE_SSC_MARKS
2115  SSC_MARK_DISPATCH_NEXT();
2116 #endif
2117  OMPT_LOOP_END;
2118  KMP_STATS_LOOP_END;
2119  return status;
2120 }
2121 
2122 template <typename T>
2123 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2124  kmp_int32 *plastiter, T *plower, T *pupper,
2125  typename traits_t<T>::signed_t incr) {
2126  typedef typename traits_t<T>::unsigned_t UT;
2127  kmp_uint32 team_id;
2128  kmp_uint32 nteams;
2129  UT trip_count;
2130  kmp_team_t *team;
2131  kmp_info_t *th;
2132 
2133  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2134  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2135 #ifdef KMP_DEBUG
2136  typedef typename traits_t<T>::signed_t ST;
2137  {
2138  char *buff;
2139  // create format specifiers before the debug output
2140  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2141  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2142  traits_t<T>::spec, traits_t<T>::spec,
2143  traits_t<ST>::spec, traits_t<T>::spec);
2144  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2145  __kmp_str_free(&buff);
2146  }
2147 #endif
2148 
2149  if (__kmp_env_consistency_check) {
2150  if (incr == 0) {
2151  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2152  loc);
2153  }
2154  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2155  // The loop is illegal.
2156  // Some zero-trip loops are maintained by the compiler, e.g.:
2157  // for(i=10;i<0;++i) // lower >= upper - run-time check
2158  // for(i=0;i>10;--i) // lower <= upper - run-time check
2159  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2160  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2161  // Compiler does not check the following illegal loops:
2162  // for(i=0;i<10;i+=incr) // where incr<0
2163  // for(i=10;i>0;i-=incr) // where incr<0
2164  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2165  }
2166  }
2167  th = __kmp_threads[gtid];
2168  team = th->th.th_team;
2169 #if OMP_40_ENABLED
2170  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2171  nteams = th->th.th_teams_size.nteams;
2172 #endif
2173  team_id = team->t.t_master_tid;
2174  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2175 
2176  // compute global trip count
2177  if (incr == 1) {
2178  trip_count = *pupper - *plower + 1;
2179  } else if (incr == -1) {
2180  trip_count = *plower - *pupper + 1;
2181  } else if (incr > 0) {
2182  // upper-lower can exceed the limit of signed type
2183  trip_count = (UT)(*pupper - *plower) / incr + 1;
2184  } else {
2185  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2186  }
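 // e.g. *plower == 0, *pupper == 9, incr == 2 gives trip_count = 9 / 2 + 1 = 5.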
2187 
2188  if (trip_count <= nteams) {
2189  KMP_DEBUG_ASSERT(
2190  __kmp_static == kmp_sch_static_greedy ||
2191  __kmp_static ==
2192  kmp_sch_static_balanced); // Unknown static scheduling type.
2193  // only some teams get a single iteration, the others get nothing
2194  if (team_id < trip_count) {
2195  *pupper = *plower = *plower + team_id * incr;
2196  } else {
2197  *plower = *pupper + incr; // zero-trip loop
2198  }
2199  if (plastiter != NULL)
2200  *plastiter = (team_id == trip_count - 1);
2201  } else {
2202  if (__kmp_static == kmp_sch_static_balanced) {
2203  UT chunk = trip_count / nteams;
2204  UT extras = trip_count % nteams;
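 // The first `extras` teams get chunk + 1 iterations, the remaining teams
 // get chunk. e.g. trip_count == 10, nteams == 4: chunk == 2, extras == 2,
 // so teams 0-1 get 3 iterations each and teams 2-3 get 2.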
2205  *plower +=
2206  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2207  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2208  if (plastiter != NULL)
2209  *plastiter = (team_id == nteams - 1);
2210  } else {
2211  T chunk_inc_count =
2212  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2213  T upper = *pupper;
2214  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2215  // Unknown static scheduling type.
2216  *plower += team_id * chunk_inc_count;
2217  *pupper = *plower + chunk_inc_count - incr;
2218  // Check/correct bounds if needed
2219  if (incr > 0) {
2220  if (*pupper < *plower)
2221  *pupper = traits_t<T>::max_value;
2222  if (plastiter != NULL)
2223  *plastiter = *plower <= upper && *pupper > upper - incr;
2224  if (*pupper > upper)
2225  *pupper = upper; // tracker C73258
2226  } else {
2227  if (*pupper > *plower)
2228  *pupper = traits_t<T>::min_value;
2229  if (plastiter != NULL)
2230  *plastiter = *plower >= upper && *pupper < upper - incr;
2231  if (*pupper < upper)
2232  *pupper = upper; // tracker C73258
2233  }
2234  }
2235  }
2236 }
2237 
2238 //-----------------------------------------------------------------------------
2239 // Dispatch routines
2240 // Transfer call to template< type T >
2241 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2242 // T lb, T ub, ST st, ST chunk )
2243 extern "C" {
2244 
2261 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2262  enum sched_type schedule, kmp_int32 lb,
2263  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2264  KMP_DEBUG_ASSERT(__kmp_init_serial);
2265 #if OMPT_SUPPORT && OMPT_OPTIONAL
2266  OMPT_STORE_RETURN_ADDRESS(gtid);
2267 #endif
2268  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2269 }
2273 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2274  enum sched_type schedule, kmp_uint32 lb,
2275  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2276  KMP_DEBUG_ASSERT(__kmp_init_serial);
2277 #if OMPT_SUPPORT && OMPT_OPTIONAL
2278  OMPT_STORE_RETURN_ADDRESS(gtid);
2279 #endif
2280  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2281 }
2282 
2286 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2287  enum sched_type schedule, kmp_int64 lb,
2288  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2289  KMP_DEBUG_ASSERT(__kmp_init_serial);
2290 #if OMPT_SUPPORT && OMPT_OPTIONAL
2291  OMPT_STORE_RETURN_ADDRESS(gtid);
2292 #endif
2293  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2294 }
2295 
2299 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2300  enum sched_type schedule, kmp_uint64 lb,
2301  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2302  KMP_DEBUG_ASSERT(__kmp_init_serial);
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304  OMPT_STORE_RETURN_ADDRESS(gtid);
2305 #endif
2306  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2307 }
2308 
2318 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2319  enum sched_type schedule, kmp_int32 *p_last,
2320  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2321  kmp_int32 chunk) {
2322  KMP_DEBUG_ASSERT(__kmp_init_serial);
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324  OMPT_STORE_RETURN_ADDRESS(gtid);
2325 #endif
2326  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2327  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2328 }
2329 
2330 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2331  enum sched_type schedule, kmp_int32 *p_last,
2332  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2333  kmp_int32 chunk) {
2334  KMP_DEBUG_ASSERT(__kmp_init_serial);
2335 #if OMPT_SUPPORT && OMPT_OPTIONAL
2336  OMPT_STORE_RETURN_ADDRESS(gtid);
2337 #endif
2338  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2339  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2340 }
2341 
2342 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2343  enum sched_type schedule, kmp_int32 *p_last,
2344  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2345  kmp_int64 chunk) {
2346  KMP_DEBUG_ASSERT(__kmp_init_serial);
2347 #if OMPT_SUPPORT && OMPT_OPTIONAL
2348  OMPT_STORE_RETURN_ADDRESS(gtid);
2349 #endif
2350  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2351  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2352 }
2353 
2354 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2355  enum sched_type schedule, kmp_int32 *p_last,
2356  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2357  kmp_int64 chunk) {
2358  KMP_DEBUG_ASSERT(__kmp_init_serial);
2359 #if OMPT_SUPPORT && OMPT_OPTIONAL
2360  OMPT_STORE_RETURN_ADDRESS(gtid);
2361 #endif
2362  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2363  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2364 }
2365 
2379 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2380  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2381 #if OMPT_SUPPORT && OMPT_OPTIONAL
2382  OMPT_STORE_RETURN_ADDRESS(gtid);
2383 #endif
2384  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2385 #if OMPT_SUPPORT && OMPT_OPTIONAL
2386  ,
2387  OMPT_LOAD_RETURN_ADDRESS(gtid)
2388 #endif
2389  );
2390 }
2391 
2395 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2396  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2397  kmp_int32 *p_st) {
2398 #if OMPT_SUPPORT && OMPT_OPTIONAL
2399  OMPT_STORE_RETURN_ADDRESS(gtid);
2400 #endif
2401  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2402 #if OMPT_SUPPORT && OMPT_OPTIONAL
2403  ,
2404  OMPT_LOAD_RETURN_ADDRESS(gtid)
2405 #endif
2406  );
2407 }
2408 
2412 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2413  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2414 #if OMPT_SUPPORT && OMPT_OPTIONAL
2415  OMPT_STORE_RETURN_ADDRESS(gtid);
2416 #endif
2417  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419  ,
2420  OMPT_LOAD_RETURN_ADDRESS(gtid)
2421 #endif
2422  );
2423 }
2424 
2428 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2429  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2430  kmp_int64 *p_st) {
2431 #if OMPT_SUPPORT && OMPT_OPTIONAL
2432  OMPT_STORE_RETURN_ADDRESS(gtid);
2433 #endif
2434  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436  ,
2437  OMPT_LOAD_RETURN_ADDRESS(gtid)
2438 #endif
2439  );
2440 }
2441 
2448 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2449  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2450 }
2451 
2455 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2456  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2457 }
2458 
2462 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2463  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2464 }
2465 
2469 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2470  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2471 }
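/* Usage sketch (illustrative only, not part of the runtime): the call
   sequence a compiler typically emits for a loop such as
   "#pragma omp for schedule(dynamic, 4)" over i = 0..99, using the 32-bit
   entry points above. The local variable names are hypothetical.

     kmp_int32 lb = 0, ub = 99, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st) {
         // ... user loop body ...
       }
       // For an ordered loop the compiler would additionally call
       // __kmpc_dispatch_fini_4(loc, gtid); after executing each chunk.
     }
*/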
2474 //-----------------------------------------------------------------------------
2475 // Non-template routines from kmp_dispatch.cpp used in other sources
2476 
2477 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2478  return value == checker;
2479 }
2480 
2481 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2482  return value != checker;
2483 }
2484 
2485 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2486  return value < checker;
2487 }
2488 
2489 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2490  return value >= checker;
2491 }
2492 
2493 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2494  return value <= checker;
2495 }
2496 
2497 kmp_uint32
2498 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2499  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2500  void *obj // Higher-level synchronization object, or NULL.
2501  ) {
2502  // note: we may not belong to a team at this point
2503  volatile kmp_uint32 *spin = spinner;
2504  kmp_uint32 check = checker;
2505  kmp_uint32 spins;
2506  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2507  kmp_uint32 r;
2508 
2509  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2510  KMP_INIT_YIELD(spins);
2511  // main wait spin loop
2512  while (!f(r = TCR_4(*spin), check)) {
2513  KMP_FSYNC_SPIN_PREPARE(obj);
2514  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2515  split. It causes problems with infinite recursion because of exit lock */
2516  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2517  __kmp_abort_thread(); */
2518  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2519  }
2520  KMP_FSYNC_SPIN_ACQUIRED(obj);
2521  return r;
2522 }
2523 
2524 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2525  kmp_uint32 (*pred)(void *, kmp_uint32),
2526  void *obj // Higher-level synchronization object, or NULL.
2527  ) {
2528  // note: we may not belong to a team at this point
2529  void *spin = spinner;
2530  kmp_uint32 check = checker;
2531  kmp_uint32 spins;
2532  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2533 
2534  KMP_FSYNC_SPIN_INIT(obj, spin);
2535  KMP_INIT_YIELD(spins);
2536  // main wait spin loop
2537  while (!f(spin, check)) {
2538  KMP_FSYNC_SPIN_PREPARE(obj);
2539  /* if we have waited a bit, or are oversubscribed, yield */
2540  /* pause is in the following code */
2541  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2542  }
2543  KMP_FSYNC_SPIN_ACQUIRED(obj);
2544 }
2545 
2546 } // extern "C"
2547 
2548 #ifdef KMP_GOMP_COMPAT
2549 
2550 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2551  enum sched_type schedule, kmp_int32 lb,
2552  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2553  int push_ws) {
2554  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2555  push_ws);
2556 }
2557 
2558 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2559  enum sched_type schedule, kmp_uint32 lb,
2560  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2561  int push_ws) {
2562  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2563  push_ws);
2564 }
2565 
2566 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2567  enum sched_type schedule, kmp_int64 lb,
2568  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2569  int push_ws) {
2570  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2571  push_ws);
2572 }
2573 
2574 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2575  enum sched_type schedule, kmp_uint64 lb,
2576  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2577  int push_ws) {
2578  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2579  push_ws);
2580 }
2581 
2582 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2583  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2584 }
2585 
2586 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2587  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2588 }
2589 
2590 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2591  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2592 }
2593 
2594 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2595  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2596 }
2597 
2598 #endif /* KMP_GOMP_COMPAT */
2599 
2600 /* ------------------------------------------------------------------------ */