1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #if OMPT_SUPPORT
19 #include "ompt-specific.h"
20 #endif
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #include "tsan_annotations.h"
28 
29 #if KMP_MIC && USE_NGO_STORES
30 // ICV copying
31 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
32 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
35 #else
36 #define ngo_load(src) ((void)0)
37 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
38 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
39 #define ngo_sync() ((void)0)
40 #endif /* KMP_MIC && USE_NGO_STORES */
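// Notes on the ngo_* helpers above: on KMP_MIC builds they use 512-bit
// non-globally-ordered (streaming) stores to broadcast the cache line that
// holds the ICVs, with ngo_sync() serving as the ordering fence; on every
// other target they degrade to a plain copy_icvs()/KMP_MEMCPY of the same
// data and a no-op sync.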
41 
42 void __kmp_print_structure(void); // Forward declaration
43 
44 // ---------------------------- Barrier Algorithms ----------------------------
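// Each algorithm below has a "gather" phase, in which arriving workers bump
// a 64-bit b_arrived flag that their parent (ultimately the master) waits
// on, and a "release" phase, in which the master (and, in the tree-shaped
// schemes, interior nodes) wakes workers through their b_go flags,
// optionally pushing updated ICVs on the way down.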
45 
46 // Linear Barrier
47 template <bool cancellable = false>
48 static bool __kmp_linear_barrier_gather_template(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // No sleep bit or atomics needed here; the team's b_arrived is only accessed by the master
90  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92  // Collect all the worker team member threads.
93  for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95  // Prefetch next thread's arrived count
96  if (i + 1 < nproc)
97  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100  "arrived(%p) == %llu\n",
101  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102  team->t.t_id, i,
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105  // Wait for worker thread to arrive
106  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107  new_state);
108  if (cancellable) {
109  bool cancelled = flag.wait_cancellable_nosleep(
110  this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
111  if (cancelled)
112  return true;
113  } else {
114  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
115  }
116  ANNOTATE_BARRIER_END(other_threads[i]);
117 #if USE_ITT_BUILD && USE_ITT_NOTIFY
118  // Barrier imbalance - write min of the thread time and the other thread
119  // time to the thread.
120  if (__kmp_forkjoin_frames_mode == 2) {
121  this_thr->th.th_bar_min_time = KMP_MIN(
122  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
123  }
124 #endif
125  if (reduce) {
126  KA_TRACE(100,
127  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
128  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
129  team->t.t_id, i));
130  ANNOTATE_REDUCE_AFTER(reduce);
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  ANNOTATE_REDUCE_BEFORE(reduce);
134  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
135  }
136  }
137  // No sleep bit or atomics needed here; the team's b_arrived is only set by the master
138  team_bar->b_arrived = new_state;
139  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
140  "arrived(%p) = %llu\n",
141  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
142  new_state));
143  }
144  KA_TRACE(
145  20,
146  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
147  gtid, team->t.t_id, tid, bt));
148  return false;
149 }
150 
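// Linear release: the master optionally pushes ICVs into every worker's
// implicit task and then releases each worker individually through its b_go
// flag; a worker just waits on its own b_go, resets it to
// KMP_INIT_BARRIER_STATE, and returns.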
151 template <bool cancellable = false>
152 static bool __kmp_linear_barrier_release_template(
153  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
154  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
155  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
156  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
157  kmp_team_t *team;
158 
159  if (KMP_MASTER_TID(tid)) {
160  unsigned int i;
161  kmp_uint32 nproc = this_thr->th.th_team_nproc;
162  kmp_info_t **other_threads;
163 
164  team = __kmp_threads[gtid]->th.th_team;
165  KMP_DEBUG_ASSERT(team != NULL);
166  other_threads = team->t.t_threads;
167 
168  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
169  "barrier type %d\n",
170  gtid, team->t.t_id, tid, bt));
171 
172  if (nproc > 1) {
173 #if KMP_BARRIER_ICV_PUSH
174  {
175  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
176  if (propagate_icvs) {
177  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
178  for (i = 1; i < nproc; ++i) {
179  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
180  team, i, FALSE);
181  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
182  &team->t.t_implicit_task_taskdata[0].td_icvs);
183  }
184  ngo_sync();
185  }
186  }
187 #endif // KMP_BARRIER_ICV_PUSH
188 
189  // Now, release all of the worker threads
190  for (i = 1; i < nproc; ++i) {
191 #if KMP_CACHE_MANAGE
192  // Prefetch next thread's go flag
193  if (i + 1 < nproc)
194  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
195 #endif /* KMP_CACHE_MANAGE */
196  KA_TRACE(
197  20,
198  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
199  "go(%p): %u => %u\n",
200  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
201  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
202  other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
204  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
205  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
206  other_threads[i]);
207  flag.release();
208  }
209  }
210  } else { // Wait for the MASTER thread to release us
211  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
212  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
213  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
214  if (cancellable) {
215  bool cancelled = flag.wait_cancellable_nosleep(
216  this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
217  if (cancelled) {
218  return true;
219  }
220  } else {
221  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
222  }
223  ANNOTATE_BARRIER_END(this_thr);
224 #if USE_ITT_BUILD && USE_ITT_NOTIFY
225  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
226  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
227  // disabled)
228  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
229  // Cancel wait on previous parallel region...
230  __kmp_itt_task_starting(itt_sync_obj);
231 
232  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
233  return false;
234 
235  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
236  if (itt_sync_obj != NULL)
237  // Call prepare as early as possible for "new" barrier
238  __kmp_itt_task_finished(itt_sync_obj);
239  } else
240 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
241  // Early exit for reaping threads releasing forkjoin barrier
242  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
243  return false;
244 // The worker thread may now assume that the team is valid.
245 #ifdef KMP_DEBUG
246  tid = __kmp_tid_from_gtid(gtid);
247  team = __kmp_threads[gtid]->th.th_team;
248 #endif
249  KMP_DEBUG_ASSERT(team != NULL);
250  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
251  KA_TRACE(20,
252  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
253  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
254  KMP_MB(); // Flush all pending memory write invalidates.
255  }
256  KA_TRACE(
257  20,
258  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
259  gtid, team->t.t_id, tid, bt));
260  return false;
261 }
262 
263 static void __kmp_linear_barrier_gather(
264  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
265  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
266  __kmp_linear_barrier_gather_template<false>(
267  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
268 }
269 
270 static bool __kmp_linear_barrier_gather_cancellable(
271  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
272  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
273  return __kmp_linear_barrier_gather_template<true>(
274  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
275 }
276 
277 static void __kmp_linear_barrier_release(
278  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
279  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
280  __kmp_linear_barrier_release_template<false>(
281  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
282 }
283 
284 static bool __kmp_linear_barrier_release_cancellable(
285  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
286  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
287  return __kmp_linear_barrier_release_template<true>(
288  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
289 }
290 
291 // Tree barrier
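// Tree numbering used below: a thread's children are tids
// (tid << branch_bits) + 1 .. (tid << branch_bits) + branch_factor and its
// parent is (tid - 1) >> branch_bits. For example, with branch_bits == 2
// (branch_factor == 4), tid 0 gathers from tids 1-4, tid 1 gathers from
// tids 5-8, and the parent of tids 5-8 is tid 1.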
292 static void
293 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
294  int tid, void (*reduce)(void *, void *)
295  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
296  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
297  kmp_team_t *team = this_thr->th.th_team;
298  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
299  kmp_info_t **other_threads = team->t.t_threads;
300  kmp_uint32 nproc = this_thr->th.th_team_nproc;
301  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
302  kmp_uint32 branch_factor = 1 << branch_bits;
303  kmp_uint32 child;
304  kmp_uint32 child_tid;
305  kmp_uint64 new_state;
306 
307  KA_TRACE(
308  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
309  gtid, team->t.t_id, tid, bt));
310  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
311 
312 #if USE_ITT_BUILD && USE_ITT_NOTIFY
313  // Barrier imbalance - save arrive time to the thread
314  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
315  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
316  __itt_get_timestamp();
317  }
318 #endif
319  // Perform tree gather to wait until all threads have arrived; reduce any
320  // required data as we go
321  child_tid = (tid << branch_bits) + 1;
322  if (child_tid < nproc) {
323  // Parent threads wait for all their children to arrive
324  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
325  child = 1;
326  do {
327  kmp_info_t *child_thr = other_threads[child_tid];
328  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
329 #if KMP_CACHE_MANAGE
330  // Prefetch next thread's arrived count
331  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
332  KMP_CACHE_PREFETCH(
333  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
334 #endif /* KMP_CACHE_MANAGE */
335  KA_TRACE(20,
336  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
337  "arrived(%p) == %llu\n",
338  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
339  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
340  // Wait for child to arrive
341  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
342  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
343  ANNOTATE_BARRIER_END(child_thr);
344 #if USE_ITT_BUILD && USE_ITT_NOTIFY
345  // Barrier imbalance - write min of the thread time and a child time to
346  // the thread.
347  if (__kmp_forkjoin_frames_mode == 2) {
348  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
349  child_thr->th.th_bar_min_time);
350  }
351 #endif
352  if (reduce) {
353  KA_TRACE(100,
354  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
355  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
356  team->t.t_id, child_tid));
357  ANNOTATE_REDUCE_AFTER(reduce);
358  (*reduce)(this_thr->th.th_local.reduce_data,
359  child_thr->th.th_local.reduce_data);
360  ANNOTATE_REDUCE_BEFORE(reduce);
361  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
362  }
363  child++;
364  child_tid++;
365  } while (child <= branch_factor && child_tid < nproc);
366  }
367 
368  if (!KMP_MASTER_TID(tid)) { // Worker threads
369  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
370 
371  KA_TRACE(20,
372  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
373  "arrived(%p): %llu => %llu\n",
374  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
375  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
376  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
377 
378  // Mark arrival to parent thread
379  /* After performing this write, a worker thread may not assume that the team
380  is valid any more - it could be deallocated by the master thread at any
381  time. */
382  ANNOTATE_BARRIER_BEGIN(this_thr);
383  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
384  flag.release();
385  } else {
386  // Need to update the team arrived pointer if we are the master thread
387  if (nproc > 1) // New value was already computed above
388  team->t.t_bar[bt].b_arrived = new_state;
389  else
390  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
391  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
392  "arrived(%p) = %llu\n",
393  gtid, team->t.t_id, tid, team->t.t_id,
394  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
395  }
396  KA_TRACE(20,
397  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
398  gtid, team->t.t_id, tid, bt));
399 }
400 
401 static void __kmp_tree_barrier_release(
402  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
403  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
404  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
405  kmp_team_t *team;
406  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
407  kmp_uint32 nproc;
408  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
409  kmp_uint32 branch_factor = 1 << branch_bits;
410  kmp_uint32 child;
411  kmp_uint32 child_tid;
412 
413  // Perform a tree release for all of the threads that have been gathered
414  if (!KMP_MASTER_TID(
415  tid)) { // Handle fork barrier workers who aren't part of a team yet
416  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
417  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
418  // Wait for parent thread to release us
419  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
420  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
421  ANNOTATE_BARRIER_END(this_thr);
422 #if USE_ITT_BUILD && USE_ITT_NOTIFY
423  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
424  // In fork barrier where we could not get the object reliably (or
425  // ITTNOTIFY is disabled)
426  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
427  // Cancel wait on previous parallel region...
428  __kmp_itt_task_starting(itt_sync_obj);
429 
430  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
431  return;
432 
433  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
434  if (itt_sync_obj != NULL)
435  // Call prepare as early as possible for "new" barrier
436  __kmp_itt_task_finished(itt_sync_obj);
437  } else
438 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
439  // Early exit for reaping threads releasing forkjoin barrier
440  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
441  return;
442 
443  // The worker thread may now assume that the team is valid.
444  team = __kmp_threads[gtid]->th.th_team;
445  KMP_DEBUG_ASSERT(team != NULL);
446  tid = __kmp_tid_from_gtid(gtid);
447 
448  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
449  KA_TRACE(20,
450  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
451  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
452  KMP_MB(); // Flush all pending memory write invalidates.
453  } else {
454  team = __kmp_threads[gtid]->th.th_team;
455  KMP_DEBUG_ASSERT(team != NULL);
456  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
457  "barrier type %d\n",
458  gtid, team->t.t_id, tid, bt));
459  }
460  nproc = this_thr->th.th_team_nproc;
461  child_tid = (tid << branch_bits) + 1;
462 
463  if (child_tid < nproc) {
464  kmp_info_t **other_threads = team->t.t_threads;
465  child = 1;
466  // Parent threads release all their children
467  do {
468  kmp_info_t *child_thr = other_threads[child_tid];
469  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
470 #if KMP_CACHE_MANAGE
471  // Prefetch next thread's go count
472  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
473  KMP_CACHE_PREFETCH(
474  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
475 #endif /* KMP_CACHE_MANAGE */
476 
477 #if KMP_BARRIER_ICV_PUSH
478  {
479  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
480  if (propagate_icvs) {
481  __kmp_init_implicit_task(team->t.t_ident,
482  team->t.t_threads[child_tid], team,
483  child_tid, FALSE);
484  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
485  &team->t.t_implicit_task_taskdata[0].td_icvs);
486  }
487  }
488 #endif // KMP_BARRIER_ICV_PUSH
489  KA_TRACE(20,
490  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
491  "go(%p): %u => %u\n",
492  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
493  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
494  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
495  // Release child from barrier
496  ANNOTATE_BARRIER_BEGIN(child_thr);
497  kmp_flag_64 flag(&child_bar->b_go, child_thr);
498  flag.release();
499  child++;
500  child_tid++;
501  } while (child <= branch_factor && child_tid < nproc);
502  }
503  KA_TRACE(
504  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
505  gtid, team->t.t_id, tid, bt));
506 }
507 
508 // Hyper Barrier
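// The hyper (hypercube-embedded tree) gather works in rounds over `level`.
// A thread with ((tid >> level) & (branch_factor - 1)) != 0 signals the
// parent whose tid is its own with the low level + branch_bits bits cleared
// and then drops out; otherwise it waits for up to branch_factor - 1
// children spaced 1 << level apart. With branch_factor == 2 and 8 threads:
// round 0: 1->0, 3->2, 5->4, 7->6; round 1: 2->0, 6->4; round 2: 4->0.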
509 static void
510 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
511  int tid, void (*reduce)(void *, void *)
512  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
513  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514  kmp_team_t *team = this_thr->th.th_team;
515  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516  kmp_info_t **other_threads = team->t.t_threads;
517  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520  kmp_uint32 branch_factor = 1 << branch_bits;
521  kmp_uint32 offset;
522  kmp_uint32 level;
523 
524  KA_TRACE(
525  20,
526  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
527  gtid, team->t.t_id, tid, bt));
528  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
529 
530 #if USE_ITT_BUILD && USE_ITT_NOTIFY
531  // Barrier imbalance - save arrive time to the thread
532  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
533  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
534  __itt_get_timestamp();
535  }
536 #endif
537  /* Perform a hypercube-embedded tree gather to wait until all of the threads
538  have arrived, and reduce any required data as we go. */
539  kmp_flag_64 p_flag(&thr_bar->b_arrived);
540  for (level = 0, offset = 1; offset < num_threads;
541  level += branch_bits, offset <<= branch_bits) {
542  kmp_uint32 child;
543  kmp_uint32 child_tid;
544 
545  if (((tid >> level) & (branch_factor - 1)) != 0) {
546  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
547 
548  KA_TRACE(20,
549  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
550  "arrived(%p): %llu => %llu\n",
551  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
552  team->t.t_id, parent_tid, &thr_bar->b_arrived,
553  thr_bar->b_arrived,
554  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
555  // Mark arrival to parent thread
556  /* After performing this write (in the last iteration of the enclosing for
557  loop), a worker thread may not assume that the team is valid any more
558  - it could be deallocated by the master thread at any time. */
559  ANNOTATE_BARRIER_BEGIN(this_thr);
560  p_flag.set_waiter(other_threads[parent_tid]);
561  p_flag.release();
562  break;
563  }
564 
565  // Parent threads wait for children to arrive
566  if (new_state == KMP_BARRIER_UNUSED_STATE)
567  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
568  for (child = 1, child_tid = tid + (1 << level);
569  child < branch_factor && child_tid < num_threads;
570  child++, child_tid += (1 << level)) {
571  kmp_info_t *child_thr = other_threads[child_tid];
572  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
573 #if KMP_CACHE_MANAGE
574  kmp_uint32 next_child_tid = child_tid + (1 << level);
575  // Prefetch next thread's arrived count
576  if (child + 1 < branch_factor && next_child_tid < num_threads)
577  KMP_CACHE_PREFETCH(
578  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
579 #endif /* KMP_CACHE_MANAGE */
580  KA_TRACE(20,
581  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
582  "arrived(%p) == %llu\n",
583  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
584  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
585  // Wait for child to arrive
586  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
587  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
588  ANNOTATE_BARRIER_END(child_thr);
589 #if USE_ITT_BUILD && USE_ITT_NOTIFY
590  // Barrier imbalance - write min of the thread time and a child time to
591  // the thread.
592  if (__kmp_forkjoin_frames_mode == 2) {
593  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
594  child_thr->th.th_bar_min_time);
595  }
596 #endif
597  if (reduce) {
598  KA_TRACE(100,
599  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
600  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
601  team->t.t_id, child_tid));
602  ANNOTATE_REDUCE_AFTER(reduce);
603  (*reduce)(this_thr->th.th_local.reduce_data,
604  child_thr->th.th_local.reduce_data);
605  ANNOTATE_REDUCE_BEFORE(reduce);
606  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
607  }
608  }
609  }
610 
611  if (KMP_MASTER_TID(tid)) {
612  // Need to update the team arrived pointer if we are the master thread
613  if (new_state == KMP_BARRIER_UNUSED_STATE)
614  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
615  else
616  team->t.t_bar[bt].b_arrived = new_state;
617  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
618  "arrived(%p) = %llu\n",
619  gtid, team->t.t_id, tid, team->t.t_id,
620  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
621  }
622  KA_TRACE(
623  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
624  gtid, team->t.t_id, tid, bt));
625 }
626 
627 // The reverse versions seem to beat the forward versions overall
628 #define KMP_REVERSE_HYPER_BAR
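// With KMP_REVERSE_HYPER_BAR defined, the release below first counts up to
// the level at which this thread would have signalled its parent in the
// gather, then walks back down the levels, releasing the children at each
// level from the highest tid to the lowest, i.e. in the reverse of the
// gather order.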
629 static void __kmp_hyper_barrier_release(
630  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
631  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
632  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
633  kmp_team_t *team;
634  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
635  kmp_info_t **other_threads;
636  kmp_uint32 num_threads;
637  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
638  kmp_uint32 branch_factor = 1 << branch_bits;
639  kmp_uint32 child;
640  kmp_uint32 child_tid;
641  kmp_uint32 offset;
642  kmp_uint32 level;
643 
644  /* Perform a hypercube-embedded tree release for all of the threads that have
645  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
646  are released in the reverse order of the corresponding gather, otherwise
647  threads are released in the same order. */
648  if (KMP_MASTER_TID(tid)) { // master
649  team = __kmp_threads[gtid]->th.th_team;
650  KMP_DEBUG_ASSERT(team != NULL);
651  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
652  "barrier type %d\n",
653  gtid, team->t.t_id, tid, bt));
654 #if KMP_BARRIER_ICV_PUSH
655  if (propagate_icvs) { // master already has ICVs in final destination; copy
656  copy_icvs(&thr_bar->th_fixed_icvs,
657  &team->t.t_implicit_task_taskdata[tid].td_icvs);
658  }
659 #endif
660  } else { // Handle fork barrier workers who aren't part of a team yet
661  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
662  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
663  // Wait for parent thread to release us
664  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
665  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
666  ANNOTATE_BARRIER_END(this_thr);
667 #if USE_ITT_BUILD && USE_ITT_NOTIFY
668  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
669  // In fork barrier where we could not get the object reliably
670  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
671  // Cancel wait on previous parallel region...
672  __kmp_itt_task_starting(itt_sync_obj);
673 
674  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
675  return;
676 
677  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
678  if (itt_sync_obj != NULL)
679  // Call prepare as early as possible for "new" barrier
680  __kmp_itt_task_finished(itt_sync_obj);
681  } else
682 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
683  // Early exit for reaping threads releasing forkjoin barrier
684  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
685  return;
686 
687  // The worker thread may now assume that the team is valid.
688  team = __kmp_threads[gtid]->th.th_team;
689  KMP_DEBUG_ASSERT(team != NULL);
690  tid = __kmp_tid_from_gtid(gtid);
691 
692  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
693  KA_TRACE(20,
694  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
695  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
696  KMP_MB(); // Flush all pending memory write invalidates.
697  }
698  num_threads = this_thr->th.th_team_nproc;
699  other_threads = team->t.t_threads;
700 
701 #ifdef KMP_REVERSE_HYPER_BAR
702  // Count up to correct level for parent
703  for (level = 0, offset = 1;
704  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
705  level += branch_bits, offset <<= branch_bits)
706  ;
707 
708  // Now go down from there
709  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
710  level -= branch_bits, offset >>= branch_bits)
711 #else
712  // Go down the tree, level by level
713  for (level = 0, offset = 1; offset < num_threads;
714  level += branch_bits, offset <<= branch_bits)
715 #endif // KMP_REVERSE_HYPER_BAR
716  {
717 #ifdef KMP_REVERSE_HYPER_BAR
718  /* Now go in reverse order through the children, highest to lowest.
719  Initial setting of child is conservative here. */
720  child = num_threads >> ((level == 0) ? level : level - 1);
721  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
722  child_tid = tid + (child << level);
723  child >= 1; child--, child_tid -= (1 << level))
724 #else
725  if (((tid >> level) & (branch_factor - 1)) != 0)
726  // No need to go lower than this, since this is the level parent would be
727  // notified
728  break;
729  // Iterate through children on this level of the tree
730  for (child = 1, child_tid = tid + (1 << level);
731  child < branch_factor && child_tid < num_threads;
732  child++, child_tid += (1 << level))
733 #endif // KMP_REVERSE_HYPER_BAR
734  {
735  if (child_tid >= num_threads)
736  continue; // Child doesn't exist so keep going
737  else {
738  kmp_info_t *child_thr = other_threads[child_tid];
739  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
740 #if KMP_CACHE_MANAGE
741  kmp_uint32 next_child_tid = child_tid - (1 << level);
742 // Prefetch next thread's go count
743 #ifdef KMP_REVERSE_HYPER_BAR
744  if (child - 1 >= 1 && next_child_tid < num_threads)
745 #else
746  if (child + 1 < branch_factor && next_child_tid < num_threads)
747 #endif // KMP_REVERSE_HYPER_BAR
748  KMP_CACHE_PREFETCH(
749  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
750 #endif /* KMP_CACHE_MANAGE */
751 
752 #if KMP_BARRIER_ICV_PUSH
753  if (propagate_icvs) // push my fixed ICVs to my child
754  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
755 #endif // KMP_BARRIER_ICV_PUSH
756 
757  KA_TRACE(
758  20,
759  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
760  "go(%p): %u => %u\n",
761  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
762  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
763  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
764  // Release child from barrier
765  ANNOTATE_BARRIER_BEGIN(child_thr);
766  kmp_flag_64 flag(&child_bar->b_go, child_thr);
767  flag.release();
768  }
769  }
770  }
771 #if KMP_BARRIER_ICV_PUSH
772  if (propagate_icvs &&
773  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
774  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
775  FALSE);
776  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
777  &thr_bar->th_fixed_icvs);
778  }
779 #endif
780  KA_TRACE(
781  20,
782  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
783  gtid, team->t.t_id, tid, bt));
784 }
785 
786 // Hierarchical Barrier
787 
788 // Initialize thread barrier data
789 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
790  Performs the minimum amount of initialization required based on how the team
791  has changed. Returns true if leaf children will require both on-core and
792  traditional wake-up mechanisms. For example, if the team size increases,
793  threads already in the team will respond to on-core wakeup on their parent
794  thread, but threads newly added to the team will only be listening on
795  their local b_go. */
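// The shape produced by __kmp_get_hierarchy() is encoded in thr_bar->depth
// and thr_bar->skip_per_level[]: the loops below visit a node's level-d
// children at tids tid + skip_per_level[d], tid + 2 * skip_per_level[d], ...
// up to (but not including) tid + skip_per_level[d + 1]. Leaf children check
// in through individual bytes of the parent's 64-bit flag words:
// thr_bar->offset selects this thread's byte (7 minus its position among its
// siblings) and thr_bar->leaf_state is the mask of the bytes owned by a
// node's leaf children.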
796 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
797  kmp_bstate_t *thr_bar,
798  kmp_uint32 nproc, int gtid,
799  int tid, kmp_team_t *team) {
800  // Checks to determine if (re-)initialization is needed
801  bool uninitialized = thr_bar->team == NULL;
802  bool team_changed = team != thr_bar->team;
803  bool team_sz_changed = nproc != thr_bar->nproc;
804  bool tid_changed = tid != thr_bar->old_tid;
805  bool retval = false;
806 
807  if (uninitialized || team_sz_changed) {
808  __kmp_get_hierarchy(nproc, thr_bar);
809  }
810 
811  if (uninitialized || team_sz_changed || tid_changed) {
812  thr_bar->my_level = thr_bar->depth - 1; // default for master
813  thr_bar->parent_tid = -1; // default for master
814  if (!KMP_MASTER_TID(
815  tid)) { // if not master, find parent thread in hierarchy
816  kmp_uint32 d = 0;
817  while (d < thr_bar->depth) { // find parent based on level of thread in
818  // hierarchy, and note level
819  kmp_uint32 rem;
820  if (d == thr_bar->depth - 2) { // reached level right below the master
821  thr_bar->parent_tid = 0;
822  thr_bar->my_level = d;
823  break;
824  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
825  0) { // TODO: can we make this op faster?
826  // thread is not a subtree root at next level, so this is max
827  thr_bar->parent_tid = tid - rem;
828  thr_bar->my_level = d;
829  break;
830  }
831  ++d;
832  }
833  }
834  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
835  thr_bar->old_tid = tid;
836  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
837  thr_bar->team = team;
838  thr_bar->parent_bar =
839  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
840  }
841  if (uninitialized || team_changed || tid_changed) {
842  thr_bar->team = team;
843  thr_bar->parent_bar =
844  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
845  retval = true;
846  }
847  if (uninitialized || team_sz_changed || tid_changed) {
848  thr_bar->nproc = nproc;
849  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
850  if (thr_bar->my_level == 0)
851  thr_bar->leaf_kids = 0;
852  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
853  thr_bar->leaf_kids = nproc - tid - 1;
854  thr_bar->leaf_state = 0;
855  for (int i = 0; i < thr_bar->leaf_kids; ++i)
856  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
857  }
858  return retval;
859 }
860 
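// Hierarchical gather: when blocktime is infinite and the on-core barrier is
// in use, leaf children check in by setting their byte inside this thread's
// b_arrived word and the parent waits for the combined leaf_state mask;
// non-leaf children, or all children when blocktime is finite, are waited on
// through their own 64-bit b_arrived flags as in the other algorithms.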
861 static void __kmp_hierarchical_barrier_gather(
862  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
864  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
865  kmp_team_t *team = this_thr->th.th_team;
866  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
867  kmp_uint32 nproc = this_thr->th.th_team_nproc;
868  kmp_info_t **other_threads = team->t.t_threads;
869  kmp_uint64 new_state;
870 
871  int level = team->t.t_level;
872 #if OMP_40_ENABLED
873  if (other_threads[0]
874  ->th.th_teams_microtask) // are we inside the teams construct?
875  if (this_thr->th.th_teams_size.nteams > 1)
876  ++level; // level was not increased in teams construct for team_of_masters
877 #endif
878  if (level == 1)
879  thr_bar->use_oncore_barrier = 1;
880  else
881  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
882 
883  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
884  "barrier type %d\n",
885  gtid, team->t.t_id, tid, bt));
886  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
887 
888 #if USE_ITT_BUILD && USE_ITT_NOTIFY
889  // Barrier imbalance - save arrive time to the thread
890  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
891  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
892  }
893 #endif
894 
895  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
896  team);
897 
898  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
899  kmp_int32 child_tid;
900  new_state =
901  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
902  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
903  thr_bar->use_oncore_barrier) {
904  if (thr_bar->leaf_kids) {
905  // First, wait for leaf children to check-in on my b_arrived flag
906  kmp_uint64 leaf_state =
907  KMP_MASTER_TID(tid)
908  ? thr_bar->b_arrived | thr_bar->leaf_state
909  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
910  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
911  "for leaf kids\n",
912  gtid, team->t.t_id, tid));
913  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
914  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
915  if (reduce) {
916  ANNOTATE_REDUCE_AFTER(reduce);
917  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
918  ++child_tid) {
919  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
920  "T#%d(%d:%d)\n",
921  gtid, team->t.t_id, tid,
922  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
923  child_tid));
924  ANNOTATE_BARRIER_END(other_threads[child_tid]);
925  (*reduce)(this_thr->th.th_local.reduce_data,
926  other_threads[child_tid]->th.th_local.reduce_data);
927  }
928  ANNOTATE_REDUCE_BEFORE(reduce);
929  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
930  }
931  // clear leaf_state bits
932  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
933  }
934  // Next, wait for higher level children on each child's b_arrived flag
935  for (kmp_uint32 d = 1; d < thr_bar->my_level;
936  ++d) { // gather lowest level threads first, but skip 0
937  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
938  skip = thr_bar->skip_per_level[d];
939  if (last > nproc)
940  last = nproc;
941  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
942  kmp_info_t *child_thr = other_threads[child_tid];
943  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
944  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
945  "T#%d(%d:%d) "
946  "arrived(%p) == %llu\n",
947  gtid, team->t.t_id, tid,
948  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
949  child_tid, &child_bar->b_arrived, new_state));
950  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
951  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
952  ANNOTATE_BARRIER_END(child_thr);
953  if (reduce) {
954  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
955  "T#%d(%d:%d)\n",
956  gtid, team->t.t_id, tid,
957  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
958  child_tid));
959  ANNOTATE_REDUCE_AFTER(reduce);
960  (*reduce)(this_thr->th.th_local.reduce_data,
961  child_thr->th.th_local.reduce_data);
962  ANNOTATE_REDUCE_BEFORE(reduce);
963  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
964  }
965  }
966  }
967  } else { // Blocktime is not infinite
968  for (kmp_uint32 d = 0; d < thr_bar->my_level;
969  ++d) { // Gather lowest level threads first
970  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
971  skip = thr_bar->skip_per_level[d];
972  if (last > nproc)
973  last = nproc;
974  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
975  kmp_info_t *child_thr = other_threads[child_tid];
976  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
977  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
978  "T#%d(%d:%d) "
979  "arrived(%p) == %llu\n",
980  gtid, team->t.t_id, tid,
981  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
982  child_tid, &child_bar->b_arrived, new_state));
983  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
984  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
985  ANNOTATE_BARRIER_END(child_thr);
986  if (reduce) {
987  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
988  "T#%d(%d:%d)\n",
989  gtid, team->t.t_id, tid,
990  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
991  child_tid));
992  ANNOTATE_REDUCE_AFTER(reduce);
993  (*reduce)(this_thr->th.th_local.reduce_data,
994  child_thr->th.th_local.reduce_data);
995  ANNOTATE_REDUCE_BEFORE(reduce);
996  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
997  }
998  }
999  }
1000  }
1001  }
1002  // All subordinates are gathered; now release parent if not master thread
1003 
1004  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1005  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1006  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1007  gtid, team->t.t_id, tid,
1008  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1009  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1010  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1011  /* Mark arrival to parent: After performing this write, a worker thread may
1012  not assume that the team is valid any more - it could be deallocated by
1013  the master thread at any time. */
1014  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1015  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1016  // flag; release it
1017  ANNOTATE_BARRIER_BEGIN(this_thr);
1018  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1019  flag.release();
1020  } else {
1021  // Leaf does special release on "offset" bits of parent's b_arrived flag
1022  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1023  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1024  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1025  flag.release();
1026  }
1027  } else { // Master thread needs to update the team's b_arrived value
1028  team->t.t_bar[bt].b_arrived = new_state;
1029  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1030  "arrived(%p) = %llu\n",
1031  gtid, team->t.t_id, tid, team->t.t_id,
1032  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1033  }
1034  // Is the team access below unsafe or just technically invalid?
1035  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1036  "barrier type %d\n",
1037  gtid, team->t.t_id, tid, bt));
1038 }
1039 
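// Hierarchical release: a worker waits either on its own b_go flag or, if it
// is a leaf and blocktime is infinite, on its byte within its parent's b_go
// word. ICVs live in th_fixed_icvs, which shares a cache line with b_go, so
// one ngo store of that line into a non-leaf child both copies the ICVs and
// sets the child's b_go; leaf children are then released collectively by
// OR-ing leaf_state into this thread's own b_go.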
1040 static void __kmp_hierarchical_barrier_release(
1041  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1042  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1043  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1044  kmp_team_t *team;
1045  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1046  kmp_uint32 nproc;
1047  bool team_change = false; // indicates on-core barrier shouldn't be used
1048 
1049  if (KMP_MASTER_TID(tid)) {
1050  team = __kmp_threads[gtid]->th.th_team;
1051  KMP_DEBUG_ASSERT(team != NULL);
1052  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1053  "entered barrier type %d\n",
1054  gtid, team->t.t_id, tid, bt));
1055  } else { // Worker threads
1056  // Wait for parent thread to release me
1057  if (!thr_bar->use_oncore_barrier ||
1058  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1059  thr_bar->team == NULL) {
1060  // Use traditional method of waiting on my own b_go flag
1061  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1062  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1063  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1064  ANNOTATE_BARRIER_END(this_thr);
1065  TCW_8(thr_bar->b_go,
1066  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1067  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1068  // infinite, not nested
1069  // Wait on my "offset" bits on parent's b_go flag
1070  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1071  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1072  thr_bar->offset, bt,
1073  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1074  flag.wait(this_thr, TRUE);
1075  if (thr_bar->wait_flag ==
1076  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1077  TCW_8(thr_bar->b_go,
1078  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1079  } else { // Reset my bits on parent's b_go flag
1080  (RCAST(volatile char *,
1081  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1082  }
1083  }
1084  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1085  // Early exit for reaping threads releasing forkjoin barrier
1086  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1087  return;
1088  // The worker thread may now assume that the team is valid.
1089  team = __kmp_threads[gtid]->th.th_team;
1090  KMP_DEBUG_ASSERT(team != NULL);
1091  tid = __kmp_tid_from_gtid(gtid);
1092 
1093  KA_TRACE(
1094  20,
1095  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1096  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1097  KMP_MB(); // Flush all pending memory write invalidates.
1098  }
1099 
1100  nproc = this_thr->th.th_team_nproc;
1101  int level = team->t.t_level;
1102 #if OMP_40_ENABLED
1103  if (team->t.t_threads[0]
1104  ->th.th_teams_microtask) { // are we inside the teams construct?
1105  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1106  this_thr->th.th_teams_level == level)
1107  ++level; // level was not increased in teams construct for team_of_workers
1108  if (this_thr->th.th_teams_size.nteams > 1)
1109  ++level; // level was not increased in teams construct for team_of_masters
1110  }
1111 #endif
1112  if (level == 1)
1113  thr_bar->use_oncore_barrier = 1;
1114  else
1115  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1116 
1117  // If the team size has increased, we still communicate with old leaves via
1118  // oncore barrier.
1119  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1120  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1121  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1122  tid, team);
1123  // But if the entire team changes, we won't use oncore barrier at all
1124  if (team_change)
1125  old_leaf_kids = 0;
1126 
1127 #if KMP_BARRIER_ICV_PUSH
1128  if (propagate_icvs) {
1129  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1130  FALSE);
1131  if (KMP_MASTER_TID(
1132  tid)) { // master already has copy in final destination; copy
1133  copy_icvs(&thr_bar->th_fixed_icvs,
1134  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1135  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1136  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1137  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1138  // leaves (on-core children) pull parent's fixed ICVs directly to local
1139  // ICV store
1140  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1141  &thr_bar->parent_bar->th_fixed_icvs);
1142  // non-leaves will get ICVs piggybacked with b_go via NGO store
1143  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1144  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1145  // access
1146  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1147  else // leaves copy parent's fixed ICVs directly to local ICV store
1148  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149  &thr_bar->parent_bar->th_fixed_icvs);
1150  }
1151  }
1152 #endif // KMP_BARRIER_ICV_PUSH
1153 
1154  // Now, release my children
1155  if (thr_bar->my_level) { // not a leaf
1156  kmp_int32 child_tid;
1157  kmp_uint32 last;
1158  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1159  thr_bar->use_oncore_barrier) {
1160  if (KMP_MASTER_TID(tid)) { // do a flat release
1161  // Set local b_go to bump children via NGO store of the cache line
1162  // containing ICVs and b_go.
1163  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1164  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1165  // the cache line
1166  ngo_load(&thr_bar->th_fixed_icvs);
1167  // This loops over all the threads skipping only the leaf nodes in the
1168  // hierarchy
1169  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1170  child_tid += thr_bar->skip_per_level[1]) {
1171  kmp_bstate_t *child_bar =
1172  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1173  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1174  "releasing T#%d(%d:%d)"
1175  " go(%p): %u => %u\n",
1176  gtid, team->t.t_id, tid,
1177  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1178  child_tid, &child_bar->b_go, child_bar->b_go,
1179  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1180  // Use ngo store (if available) to both store ICVs and release child
1181  // via child's b_go
1182  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1183  }
1184  ngo_sync();
1185  }
1186  TCW_8(thr_bar->b_go,
1187  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1188  // Now, release leaf children
1189  if (thr_bar->leaf_kids) { // if there are any
1190  // We test team_change on the off-chance that the level 1 team changed.
1191  if (team_change ||
1192  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1193  if (old_leaf_kids) { // release old leaf kids
1194  thr_bar->b_go |= old_leaf_state;
1195  }
1196  // Release new leaf kids
1197  last = tid + thr_bar->skip_per_level[1];
1198  if (last > nproc)
1199  last = nproc;
1200  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1201  ++child_tid) { // skip_per_level[0]=1
1202  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1203  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1204  KA_TRACE(
1205  20,
1206  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1207  " T#%d(%d:%d) go(%p): %u => %u\n",
1208  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1209  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1210  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1211  // Release child using child's b_go flag
1212  ANNOTATE_BARRIER_BEGIN(child_thr);
1213  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1214  flag.release();
1215  }
1216  } else { // Release all children at once with leaf_state bits on my own
1217  // b_go flag
1218  thr_bar->b_go |= thr_bar->leaf_state;
1219  }
1220  }
1221  } else { // Blocktime is not infinite; do a simple hierarchical release
1222  for (int d = thr_bar->my_level - 1; d >= 0;
1223  --d) { // Release highest level threads first
1224  last = tid + thr_bar->skip_per_level[d + 1];
1225  kmp_uint32 skip = thr_bar->skip_per_level[d];
1226  if (last > nproc)
1227  last = nproc;
1228  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1229  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1230  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1231  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1232  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1233  gtid, team->t.t_id, tid,
1234  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1235  child_tid, &child_bar->b_go, child_bar->b_go,
1236  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1237  // Release child using child's b_go flag
1238  ANNOTATE_BARRIER_BEGIN(child_thr);
1239  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1240  flag.release();
1241  }
1242  }
1243  }
1244 #if KMP_BARRIER_ICV_PUSH
1245  if (propagate_icvs && !KMP_MASTER_TID(tid))
1246  // non-leaves copy ICVs from fixed ICVs to local dest
1247  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1248  &thr_bar->th_fixed_icvs);
1249 #endif // KMP_BARRIER_ICV_PUSH
1250  }
1251  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1252  "barrier type %d\n",
1253  gtid, team->t.t_id, tid, bt));
1254 }
1255 
1256 // End of Barrier Algorithms
1257 
1258 // Type traits for the cancellable template parameter:
1259 // if cancellable is true, is_cancellable holds a normal boolean value;
1260 // if cancellable is false, is_cancellable is a compile-time constant false.
1261 template <bool cancellable> struct is_cancellable {};
1262 template <> struct is_cancellable<true> {
1263  bool value;
1264  is_cancellable() : value(false) {}
1265  is_cancellable(bool b) : value(b) {}
1266  is_cancellable &operator=(bool b) {
1267  value = b;
1268  return *this;
1269  }
1270  operator bool() const { return value; }
1271 };
1272 template <> struct is_cancellable<false> {
1273  is_cancellable &operator=(bool b) { return *this; }
1274  constexpr operator bool() const { return false; }
1275 };
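// For the cancellable = false instantiation the value is a constant false,
// so the compiler can eliminate the cancellation branches entirely. A
// minimal, hypothetical usage sketch of the pattern:
//
//   is_cancellable<false> cancelled;
//   cancelled = maybe_cancelled();   // assignment compiles to a no-op
//   if (cancelled) { /* dead code: constexpr operator bool() is false */ }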
1276 
1277 // Internal function to do a barrier.
1278 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1279  If reduce is non-NULL, do a reduction as part of the barrier; otherwise
1280  no reduction is performed.
1281  When cancellable = false,
1282  returns 0 if master thread, 1 if worker thread.
1283  When cancellable = true,
1284  returns 0 if not cancelled, 1 if cancelled. */
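// The gather/release algorithm used for each barrier type is chosen at run
// time through __kmp_barrier_gather_pattern[bt] (see the switch below) and
// the matching release pattern; the cancellable instantiation always uses
// the linear algorithm, which is the only one here with a nosleep,
// cancellation-aware wait.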
1285 template <bool cancellable = false>
1286 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1287  size_t reduce_size, void *reduce_data,
1288  void (*reduce)(void *, void *)) {
1289  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1290  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1291  int tid = __kmp_tid_from_gtid(gtid);
1292  kmp_info_t *this_thr = __kmp_threads[gtid];
1293  kmp_team_t *team = this_thr->th.th_team;
1294  int status = 0;
1295  is_cancellable<cancellable> cancelled;
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297  ompt_data_t *my_task_data;
1298  ompt_data_t *my_parallel_data;
1299  void *return_address;
1300  ompt_sync_region_t barrier_kind;
1301 #endif
1302 
1303  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1304  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1305 
1306  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1307 #if OMPT_SUPPORT
1308  if (ompt_enabled.enabled) {
1309 #if OMPT_OPTIONAL
1310  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1311  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1312  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1313  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1314  if (ompt_enabled.ompt_callback_sync_region) {
1315  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1316  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1317  return_address);
1318  }
1319  if (ompt_enabled.ompt_callback_sync_region_wait) {
1320  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1321  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1322  return_address);
1323  }
1324 #endif
1325  // It is OK to report the barrier state after the barrier begin callback.
1326  // According to the OMPT specification, a compliant implementation may
1327  // even delay reporting this state until the barrier begins to wait.
1328  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1329  }
1330 #endif
1331 
1332  if (!team->t.t_serialized) {
1333 #if USE_ITT_BUILD
1334  // This value will be used in itt notify events below.
1335  void *itt_sync_obj = NULL;
1336 #if USE_ITT_NOTIFY
1337  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1338  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1339 #endif
1340 #endif /* USE_ITT_BUILD */
1341  if (__kmp_tasking_mode == tskm_extra_barrier) {
1342  __kmp_tasking_barrier(team, this_thr, gtid);
1343  KA_TRACE(15,
1344  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1345  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1346  }
1347 
1348  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1349  access it when the team struct is not guaranteed to exist. */
1350  // See note about the corresponding code in __kmp_join_barrier() being
1351  // performance-critical.
1352  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1353 #if KMP_USE_MONITOR
1354  this_thr->th.th_team_bt_intervals =
1355  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1356  this_thr->th.th_team_bt_set =
1357  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1358 #else
1359  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1360 #endif
1361  }
1362 
1363 #if USE_ITT_BUILD
1364  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1365  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1366 #endif /* USE_ITT_BUILD */
1367 #if USE_DEBUGGER
1368  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1369  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1370  team->t.t_bar[bt].b_master_arrived += 1;
1371  } else {
1372  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1373  } // if
1374 #endif /* USE_DEBUGGER */
1375  if (reduce != NULL) {
1376  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1377  this_thr->th.th_local.reduce_data = reduce_data;
1378  }
1379 
1380  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1381  // use 0 to only set up the current team if nthreads > 1
1382  __kmp_task_team_setup(this_thr, team, 0);
1383 
1384  if (cancellable) {
1385  cancelled = __kmp_linear_barrier_gather_cancellable(
1386  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1387  } else {
1388  switch (__kmp_barrier_gather_pattern[bt]) {
1389  case bp_hyper_bar: {
1390  // don't set branch bits to 0; use linear
1391  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1392  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1393  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1394  break;
1395  }
1396  case bp_hierarchical_bar: {
1397  __kmp_hierarchical_barrier_gather(
1398  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1399  break;
1400  }
1401  case bp_tree_bar: {
1402  // don't set branch bits to 0; use linear
1403  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1404  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1405  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1406  break;
1407  }
1408  default: {
1409  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1410  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1411  }
1412  }
1413  }
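  /* The gather pattern dispatched above (and the matching release pattern
     used further below) is taken from __kmp_barrier_gather_pattern[] /
     __kmp_barrier_release_pattern[], indexed by barrier type. These tables
     are normally filled in by the settings machinery (kmp_settings.cpp) from
     environment variables such as

         KMP_PLAIN_BARRIER_PATTERN="hyper,hyper"   # gather pattern, release pattern

     with "linear", "tree", "hyper" and "hierarchical" as the recognized
     names. The exact variable spellings are the usual libomp settings and
     are noted here as an assumption, not something this file defines. */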
1414 
1415  KMP_MB();
1416 
1417  if (KMP_MASTER_TID(tid)) {
1418  status = 0;
1419  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1420  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1421  }
1422 #if USE_DEBUGGER
1423  // Let the debugger know: all threads have arrived and are starting to
1424  // leave the barrier.
1425  team->t.t_bar[bt].b_team_arrived += 1;
1426 #endif
1427 
1428 #if OMP_40_ENABLED
1429  if (__kmp_omp_cancellation) {
1430  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1431  // Reset cancellation flag for worksharing constructs
1432  if (cancel_request == cancel_loop ||
1433  cancel_request == cancel_sections) {
1434  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1435  }
1436  }
1437 #endif
1438 #if USE_ITT_BUILD
1439  /* TODO: In case of a split reduction barrier, the master thread may send
1440  the acquired event early, before the final summation into the shared
1441  variable is done (the final summation can be a long operation for array
1442  reductions). */
1443  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1444  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1445 #endif /* USE_ITT_BUILD */
1446 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1447  // Barrier - report frame end (only if active_level == 1)
1448  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1449  __kmp_forkjoin_frames_mode &&
1450 #if OMP_40_ENABLED
1451  this_thr->th.th_teams_microtask == NULL &&
1452 #endif
1453  team->t.t_active_level == 1) {
1454  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1455  kmp_uint64 cur_time = __itt_get_timestamp();
1456  kmp_info_t **other_threads = team->t.t_threads;
1457  int nproc = this_thr->th.th_team_nproc;
1458  int i;
1459  switch (__kmp_forkjoin_frames_mode) {
1460  case 1:
1461  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1462  loc, nproc);
1463  this_thr->th.th_frame_time = cur_time;
1464  break;
1465  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1466  // be fixed)
1467  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1468  1, loc, nproc);
1469  break;
1470  case 3:
1471  if (__itt_metadata_add_ptr) {
1472  // Initialize with master's wait time
1473  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1474  // Set arrive time to zero to be able to check it in
1475  // __kmp_invoke_task(); the same is done inside the loop below
1476  this_thr->th.th_bar_arrive_time = 0;
1477  for (i = 1; i < nproc; ++i) {
1478  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1479  other_threads[i]->th.th_bar_arrive_time = 0;
1480  }
1481  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1482  cur_time, delta,
1483  (kmp_uint64)(reduce != NULL));
1484  }
1485  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1486  loc, nproc);
1487  this_thr->th.th_frame_time = cur_time;
1488  break;
1489  }
1490  }
1491 #endif /* USE_ITT_BUILD */
1492  } else {
1493  status = 1;
1494 #if USE_ITT_BUILD
1495  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1496  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1497 #endif /* USE_ITT_BUILD */
1498  }
1499  if ((status == 1 || !is_split) && !cancelled) {
1500  if (cancellable) {
1501  cancelled = __kmp_linear_barrier_release_cancellable(
1502  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1503  } else {
1504  switch (__kmp_barrier_release_pattern[bt]) {
1505  case bp_hyper_bar: {
1506  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1507  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1508  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1509  break;
1510  }
1511  case bp_hierarchical_bar: {
1512  __kmp_hierarchical_barrier_release(
1513  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1514  break;
1515  }
1516  case bp_tree_bar: {
1517  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1518  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1519  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1520  break;
1521  }
1522  default: {
1523  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1524  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525  }
1526  }
1527  }
1528  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1529  __kmp_task_team_sync(this_thr, team);
1530  }
1531  }
1532 
1533 #if USE_ITT_BUILD
1534  /* GEH: TODO: Move this under if-condition above and also include in
1535  __kmp_end_split_barrier(). This will more accurately represent the actual
1536  release time of the threads for split barriers. */
1537  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1538  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1539 #endif /* USE_ITT_BUILD */
1540  } else { // Team is serialized.
1541  status = 0;
1542  if (__kmp_tasking_mode != tskm_immediate_exec) {
1543 #if OMP_45_ENABLED
1544  if (this_thr->th.th_task_team != NULL) {
1545 #if USE_ITT_NOTIFY
1546  void *itt_sync_obj = NULL;
1547  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1548  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1549  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1550  }
1551 #endif
1552 
1553  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1554  TRUE);
1555  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1556  __kmp_task_team_setup(this_thr, team, 0);
1557 
1558 #if USE_ITT_BUILD
1559  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1560  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1561 #endif /* USE_ITT_BUILD */
1562  }
1563 #else
1564  // The task team should be NULL for serialized code (tasks will be
1565  // executed immediately)
1566  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1567  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1568 #endif
1569  }
1570  }
1571  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1572  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1573  __kmp_tid_from_gtid(gtid), status));
1574 
1575 #if OMPT_SUPPORT
1576  if (ompt_enabled.enabled) {
1577 #if OMPT_OPTIONAL
1578  if (ompt_enabled.ompt_callback_sync_region_wait) {
1579  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1580  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581  return_address);
1582  }
1583  if (ompt_enabled.ompt_callback_sync_region) {
1584  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1585  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1586  return_address);
1587  }
1588 #endif
1589  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1590  }
1591 #endif
1592  ANNOTATE_BARRIER_END(&team->t.t_bar);
1593 
1594  if (cancellable)
1595  return (int)cancelled;
1596  return status;
1597 }
1598 
1599 // Returns 0 if master thread, 1 if worker thread.
1600 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1601  size_t reduce_size, void *reduce_data,
1602  void (*reduce)(void *, void *)) {
1603  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1604  reduce);
1605 }
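/* Illustrative sketch: how user code reaches __kmp_barrier(). An explicit
   "#pragma omp barrier" is lowered by the compiler to the __kmpc_barrier()
   entry point (kmp_csupport.cpp), which is expected to call
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); treat the
   exact lowering and call chain as an assumption of this sketch rather than
   something defined in this file.

       #include <omp.h>
       #include <stdio.h>

       int main(void) {
       #pragma omp parallel
         {
           int id = omp_get_thread_num();
           // Lowered to __kmpc_barrier(&loc, gtid), i.e. a plain,
           // non-split barrier with no reduction callback.
       #pragma omp barrier
           printf("thread %d passed the plain barrier\n", id);
         }
         return 0;
       }
*/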
1606 
1607 #if defined(KMP_GOMP_COMPAT)
1608 // Returns 1 if cancelled, 0 otherwise
1609 int __kmp_barrier_gomp_cancel(int gtid) {
1610  if (__kmp_omp_cancellation) {
1611  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1612  0, NULL, NULL);
1613  if (cancelled) {
1614  int tid = __kmp_tid_from_gtid(gtid);
1615  kmp_info_t *this_thr = __kmp_threads[gtid];
1616  if (KMP_MASTER_TID(tid)) {
1617  // Master does not need to revert anything
1618  } else {
1619  // Workers need to revert their private b_arrived flag
1620  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1621  KMP_BARRIER_STATE_BUMP;
1622  }
1623  }
1624  return cancelled;
1625  }
1626  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1627  return FALSE;
1628 }
1629 #endif
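/* Illustrative sketch of the GOMP path: code built against the GNU OpenMP
   ABI is expected to reach __kmp_barrier_gomp_cancel() through the
   GOMP_barrier_cancel() wrapper in the compatibility layer
   (kmp_gsupport.cpp); treat that routing, and the placeholder predicate
   something_went_wrong() below, as assumptions of this note. At the source
   level the cancellation point looks like:

       #include <omp.h>
       #include <stdbool.h>

       extern bool something_went_wrong(void);   // placeholder predicate

       void work(void) {
       #pragma omp parallel
         {
           if (something_went_wrong()) {
       #pragma omp cancel parallel
           }
           // With OMP_CANCELLATION=true (__kmp_omp_cancellation), the barrier
           // is a cancellation point: threads that observe the pending request
           // leave early, and the routine above reports 1 (cancelled).
       #pragma omp barrier
         }
       }
*/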
1630 
1631 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1632  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1633  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1634  int tid = __kmp_tid_from_gtid(gtid);
1635  kmp_info_t *this_thr = __kmp_threads[gtid];
1636  kmp_team_t *team = this_thr->th.th_team;
1637 
1638  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1639  if (!team->t.t_serialized) {
1640  if (KMP_MASTER_GTID(gtid)) {
1641  switch (__kmp_barrier_release_pattern[bt]) {
1642  case bp_hyper_bar: {
1643  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1644  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1645  FALSE USE_ITT_BUILD_ARG(NULL));
1646  break;
1647  }
1648  case bp_hierarchical_bar: {
1649  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1650  FALSE USE_ITT_BUILD_ARG(NULL));
1651  break;
1652  }
1653  case bp_tree_bar: {
1654  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1655  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1656  FALSE USE_ITT_BUILD_ARG(NULL));
1657  break;
1658  }
1659  default: {
1660  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1661  FALSE USE_ITT_BUILD_ARG(NULL));
1662  }
1663  }
1664  if (__kmp_tasking_mode != tskm_immediate_exec) {
1665  __kmp_task_team_sync(this_thr, team);
1666  } // if
1667  }
1668  }
1669  ANNOTATE_BARRIER_END(&team->t.t_bar);
1670 }
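/* Note on split barriers: __kmp_barrier() called with is_split=TRUE performs
   only the gather half for the master (see the "status == 1 || !is_split"
   check there), so the master can finish work, typically the final
   reduction, before __kmp_end_split_barrier() above supplies the release
   half; workers meanwhile wait in the release routines for the b_go signal.
   A sketch of the expected pairing in the reduction entry points
   (kmp_csupport.cpp; the exact call sites are assumed, not defined here):

       // in __kmpc_reduce(), tree-reduction case:
       retval = __kmp_barrier(UNPACK_REDUCTION_BARRIER(method), gtid,
                              TRUE, num_vars * reduce_size, reduce_data,
                              reduce_func);          // TRUE == is_split
       // the master combines the partial results while workers are held ...
       // in __kmpc_end_reduce():
       __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(method), gtid);
*/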
1671 
1672 void __kmp_join_barrier(int gtid) {
1673  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1674  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1675  kmp_info_t *this_thr = __kmp_threads[gtid];
1676  kmp_team_t *team;
1677  kmp_uint nproc;
1678  kmp_info_t *master_thread;
1679  int tid;
1680 #ifdef KMP_DEBUG
1681  int team_id;
1682 #endif /* KMP_DEBUG */
1683 #if USE_ITT_BUILD
1684  void *itt_sync_obj = NULL;
1685 #if USE_ITT_NOTIFY
1686  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
1687  // Get object created at fork_barrier
1688  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1689 #endif
1690 #endif /* USE_ITT_BUILD */
1691  KMP_MB();
1692 
1693  // Get current info
1694  team = this_thr->th.th_team;
1695  nproc = this_thr->th.th_team_nproc;
1696  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1697  tid = __kmp_tid_from_gtid(gtid);
1698 #ifdef KMP_DEBUG
1699  team_id = team->t.t_id;
1700 #endif /* KMP_DEBUG */
1701  master_thread = this_thr->th.th_team_master;
1702 #ifdef KMP_DEBUG
1703  if (master_thread != team->t.t_threads[0]) {
1704  __kmp_print_structure();
1705  }
1706 #endif /* KMP_DEBUG */
1707  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1708  KMP_MB();
1709 
1710  // Verify state
1711  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1712  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1713  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1714  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1715  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1716  gtid, team_id, tid));
1717 
1718  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1719 #if OMPT_SUPPORT
1720  if (ompt_enabled.enabled) {
1721 #if OMPT_OPTIONAL
1722  ompt_data_t *my_task_data;
1723  ompt_data_t *my_parallel_data;
1724  void *codeptr = NULL;
1725  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1726  if (KMP_MASTER_TID(ds_tid) &&
1727  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1728  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1729  codeptr = team->t.ompt_team_info.master_return_address;
1730  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1731  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1732  if (ompt_enabled.ompt_callback_sync_region) {
1733  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1734  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735  my_task_data, codeptr);
1736  }
1737  if (ompt_enabled.ompt_callback_sync_region_wait) {
1738  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1739  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1740  my_task_data, codeptr);
1741  }
1742  if (!KMP_MASTER_TID(ds_tid))
1743  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1744 #endif
1745  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1746  }
1747 #endif
1748 
1749  if (__kmp_tasking_mode == tskm_extra_barrier) {
1750  __kmp_tasking_barrier(team, this_thr, gtid);
1751  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1752  team_id, tid));
1753  }
1754 #ifdef KMP_DEBUG
1755  if (__kmp_tasking_mode != tskm_immediate_exec) {
1756  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1757  "%p, th_task_team = %p\n",
1758  __kmp_gtid_from_thread(this_thr), team_id,
1759  team->t.t_task_team[this_thr->th.th_task_state],
1760  this_thr->th.th_task_team));
1761  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1762  team->t.t_task_team[this_thr->th.th_task_state]);
1763  }
1764 #endif /* KMP_DEBUG */
1765 
1766  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1767  access it when the team struct is not guaranteed to exist. Doing these
1768  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
1769  we do not perform the copy if blocktime=infinite, since the values are not
1770  used by __kmp_wait_template() in that case. */
1771  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1772 #if KMP_USE_MONITOR
1773  this_thr->th.th_team_bt_intervals =
1774  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1775  this_thr->th.th_team_bt_set =
1776  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1777 #else
1778  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1779 #endif
1780  }
1781 
1782 #if USE_ITT_BUILD
1783  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1784  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1785 #endif /* USE_ITT_BUILD */
1786 
1787  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1788  case bp_hyper_bar: {
1789  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1790  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792  break;
1793  }
1794  case bp_hierarchical_bar: {
1795  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1796  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1797  break;
1798  }
1799  case bp_tree_bar: {
1800  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1801  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803  break;
1804  }
1805  default: {
1806  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1807  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1808  }
1809  }
1810 
1811  /* From this point on, the team data structure may be deallocated at any time
1812  by the master thread - it is unsafe to reference it in any of the worker
1813  threads. Any per-team data items that need to be referenced before the
1814  end of the barrier should be moved to the kmp_task_team_t structs. */
1815  if (KMP_MASTER_TID(tid)) {
1816  if (__kmp_tasking_mode != tskm_immediate_exec) {
1817  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1818  }
1819 #if OMP_50_ENABLED
1820  if (__kmp_display_affinity) {
1821  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1822  }
1823 #endif
1824 #if KMP_STATS_ENABLED
1825  // Have the master thread flag the workers to indicate they are now waiting
1826  // for the next parallel region. Also wake them up so they switch their
1827  // timers to idle.
1828  for (int i = 0; i < team->t.t_nproc; ++i) {
1829  kmp_info_t *team_thread = team->t.t_threads[i];
1830  if (team_thread == this_thr)
1831  continue;
1832  team_thread->th.th_stats->setIdleFlag();
1833  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1834  team_thread->th.th_sleep_loc != NULL)
1835  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1836  team_thread->th.th_sleep_loc);
1837  }
1838 #endif
1839 #if USE_ITT_BUILD
1840  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1841  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1842 #endif /* USE_ITT_BUILD */
1843 
1844 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1845  // Join barrier - report frame end
1846  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1847  __kmp_forkjoin_frames_mode &&
1848 #if OMP_40_ENABLED
1849  this_thr->th.th_teams_microtask == NULL &&
1850 #endif
1851  team->t.t_active_level == 1) {
1852  kmp_uint64 cur_time = __itt_get_timestamp();
1853  ident_t *loc = team->t.t_ident;
1854  kmp_info_t **other_threads = team->t.t_threads;
1855  int nproc = this_thr->th.th_team_nproc;
1856  int i;
1857  switch (__kmp_forkjoin_frames_mode) {
1858  case 1:
1859  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1860  loc, nproc);
1861  break;
1862  case 2:
1863  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1864  loc, nproc);
1865  break;
1866  case 3:
1867  if (__itt_metadata_add_ptr) {
1868  // Initialize with master's wait time
1869  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1870  // Set arrive time to zero to be able to check it in
1871  // __kmp_invoke_task(); the same is done inside the loop below
1872  this_thr->th.th_bar_arrive_time = 0;
1873  for (i = 1; i < nproc; ++i) {
1874  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1875  other_threads[i]->th.th_bar_arrive_time = 0;
1876  }
1877  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1878  cur_time, delta, 0);
1879  }
1880  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1881  loc, nproc);
1882  this_thr->th.th_frame_time = cur_time;
1883  break;
1884  }
1885  }
1886 #endif /* USE_ITT_BUILD */
1887  }
1888 #if USE_ITT_BUILD
1889  else {
1890  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1891  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1892  }
1893 #endif /* USE_ITT_BUILD */
1894 
1895 #if KMP_DEBUG
1896  if (KMP_MASTER_TID(tid)) {
1897  KA_TRACE(
1898  15,
1899  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1900  gtid, team_id, tid, nproc));
1901  }
1902 #endif /* KMP_DEBUG */
1903 
1904  // TODO: now, mark worker threads as done so they may be disbanded
1905  KMP_MB(); // Flush all pending memory write invalidates.
1906  KA_TRACE(10,
1907  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1908 
1909  ANNOTATE_BARRIER_END(&team->t.t_bar);
1910 }
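/* Note on the fork/join pairing: __kmp_join_barrier() runs only the gather
   half of the fork/join barrier; no *_barrier_release() call appears above.
   Workers signal arrival here, return to the worker loop (in
   __kmp_launch_thread(), outside this file), and then wait in
   __kmp_fork_barrier() below for the b_go release that starts the next
   parallel region or shuts the thread down. This is also why the comment
   above warns that the team structure may be deallocated once the master is
   past the gather. */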
1911 
1912 // TODO release worker threads' fork barriers as we are ready instead of all at
1913 // once
1914 void __kmp_fork_barrier(int gtid, int tid) {
1915  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1916  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1917  kmp_info_t *this_thr = __kmp_threads[gtid];
1918  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1919 #if USE_ITT_BUILD
1920  void *itt_sync_obj = NULL;
1921 #endif /* USE_ITT_BUILD */
1922  if (team)
1923  ANNOTATE_BARRIER_END(&team->t.t_bar);
1924 
1925  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1926  (team != NULL) ? team->t.t_id : -1, tid));
1927 
1928  // The th_team pointer is only valid for the master thread here
1929  if (KMP_MASTER_TID(tid)) {
1930 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1931  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1932  // Create itt barrier object
1933  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1934  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1935  }
1936 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1937 
1938 #ifdef KMP_DEBUG
1939  kmp_info_t **other_threads = team->t.t_threads;
1940  int i;
1941 
1942  // Verify state
1943  KMP_MB();
1944 
1945  for (i = 1; i < team->t.t_nproc; ++i) {
1946  KA_TRACE(500,
1947  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1948  "== %u.\n",
1949  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1950  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1951  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1952  KMP_DEBUG_ASSERT(
1953  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1954  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1955  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1956  }
1957 #endif
1958 
1959  if (__kmp_tasking_mode != tskm_immediate_exec) {
1960  // 0 indicates: set up the current task team if nthreads > 1
1961  __kmp_task_team_setup(this_thr, team, 0);
1962  }
1963 
1964  /* The master thread may have changed its blocktime between the join barrier
1965  and the fork barrier. Copy the blocktime info to the thread, where
1966  __kmp_wait_template() can access it when the team struct is not
1967  guaranteed to exist. */
1968  // See note about the corresponding code in __kmp_join_barrier() being
1969  // performance-critical
1970  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1971 #if KMP_USE_MONITOR
1972  this_thr->th.th_team_bt_intervals =
1973  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1974  this_thr->th.th_team_bt_set =
1975  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1976 #else
1977  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1978 #endif
1979  }
1980  } // master
1981 
1982  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1983  case bp_hyper_bar: {
1984  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1985  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1986  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1987  break;
1988  }
1989  case bp_hierarchical_bar: {
1990  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1991  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1992  break;
1993  }
1994  case bp_tree_bar: {
1995  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1996  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1997  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1998  break;
1999  }
2000  default: {
2001  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2002  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2003  }
2004  }
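  /* Unlike the FALSE passed from __kmp_barrier() and __kmp_end_split_barrier(),
     the last argument here is TRUE: it is the propagate_icvs flag of the
     release routines defined earlier in this file, so under
     KMP_BARRIER_ICV_PUSH the master's ICVs travel to the workers together
     with the b_go release. Workers reaching this switch wait inside the
     release routine (__kmp_wait_template() on their b_go flag) until the
     master starts the next parallel region. */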
2005 
2006 #if OMPT_SUPPORT
2007  if (ompt_enabled.enabled &&
2008  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2009  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2010  ompt_data_t *task_data = (team)
2011  ? OMPT_CUR_TASK_DATA(this_thr)
2012  : &(this_thr->th.ompt_thread_info.task_data);
2013  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2014 #if OMPT_OPTIONAL
2015  void *codeptr = NULL;
2016  if (KMP_MASTER_TID(ds_tid) &&
2017  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2018  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2019  codeptr = team->t.ompt_team_info.master_return_address;
2020  if (ompt_enabled.ompt_callback_sync_region_wait) {
2021  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2022  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2023  codeptr);
2024  }
2025  if (ompt_enabled.ompt_callback_sync_region) {
2026  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2027  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2028  codeptr);
2029  }
2030 #endif
2031  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2032  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2033  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2034  }
2035  }
2036 #endif
2037 
2038  // Early exit for reaping threads releasing forkjoin barrier
2039  if (TCR_4(__kmp_global.g.g_done)) {
2040  this_thr->th.th_task_team = NULL;
2041 
2042 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2043  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2044  if (!KMP_MASTER_TID(tid)) {
2045  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2046  if (itt_sync_obj)
2047  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2048  }
2049  }
2050 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2051  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2052  return;
2053  }
2054 
2055  /* We can now assume that a valid team structure has been allocated by the
2056  master and propagated to all worker threads. The current thread, however,
2057  may not be part of the team, so we can't blindly assume that the team
2058  pointer is non-null. */
2059  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2060  KMP_DEBUG_ASSERT(team != NULL);
2061  tid = __kmp_tid_from_gtid(gtid);
2062 
2063 #if KMP_BARRIER_ICV_PULL
2064  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2065  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2066  implicit task has this data before this function is called. We cannot
2067  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2068  struct, because it is not always the case that the threads arrays have
2069  been allocated when __kmp_fork_call() is executed. */
2070  {
2071  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2072  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2073  // Copy the initial ICVs from the master's thread struct to the implicit
2074  // task for this tid.
2075  KA_TRACE(10,
2076  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2077  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2078  tid, FALSE);
2079  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2080  &team->t.t_threads[0]
2081  ->th.th_bar[bs_forkjoin_barrier]
2082  .bb.th_fixed_icvs);
2083  }
2084  }
2085 #endif // KMP_BARRIER_ICV_PULL
2086 
2087  if (__kmp_tasking_mode != tskm_immediate_exec) {
2088  __kmp_task_team_sync(this_thr, team);
2089  }
2090 
2091 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2092  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2093  if (proc_bind == proc_bind_intel) {
2094 #endif
2095 #if KMP_AFFINITY_SUPPORTED
2096  // Apply dynamic affinity settings
2097  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2098  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2099  }
2100 #endif // KMP_AFFINITY_SUPPORTED
2101 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2102  } else if (proc_bind != proc_bind_false) {
2103  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2104  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2105  __kmp_gtid_from_thread(this_thr),
2106  this_thr->th.th_current_place));
2107  } else {
2108  __kmp_affinity_set_place(gtid);
2109  }
2110  }
2111 #endif
2112 #if OMP_50_ENABLED
2113  // Perform the display affinity functionality
2114  if (__kmp_display_affinity) {
2115  if (team->t.t_display_affinity
2116 #if KMP_AFFINITY_SUPPORTED
2117  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2118 #endif
2119  ) {
2120  // NULL means use the affinity-format-var ICV
2121  __kmp_aux_display_affinity(gtid, NULL);
2122  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2123  this_thr->th.th_prev_level = team->t.t_level;
2124  }
2125  }
2126  if (!KMP_MASTER_TID(tid))
2127  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2128 #endif
2129 
2130 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2131  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2132  if (!KMP_MASTER_TID(tid)) {
2133  // Get correct barrier object
2134  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2135  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2136  } // (prepare called inside barrier_release)
2137  }
2138 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2139  ANNOTATE_BARRIER_END(&team->t.t_bar);
2140  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2141  team->t.t_id, tid));
2142 }
2143 
2144 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2145  kmp_internal_control_t *new_icvs, ident_t *loc) {
2146  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2147 
2148  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2149  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2150 
2151 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2152  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2153  implicit task has this data before this function is called. */
2154 #if KMP_BARRIER_ICV_PULL
2155  /* Copy the ICVs into th_fixed_icvs in the master thread's barrier struct
2156  (that master copy remains untouched), where all of the worker threads can
2157  access them and make their own copies after the barrier. */
2158  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2159  // allocated at this point
2160  copy_icvs(
2161  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2162  new_icvs);
2163  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2164  team->t.t_threads[0], team));
2165 #elif KMP_BARRIER_ICV_PUSH
2166  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2167  // done here.
2168  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2169  team->t.t_threads[0], team));
2170 #else
2171  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2172  // time.
2173  ngo_load(new_icvs);
2174  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2175  // allocated at this point
2176  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2177  // TODO: GEH - pass in better source location info since usually NULL here
2178  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2179  f, team->t.t_threads[f], team));
2180  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2181  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2182  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2183  f, team->t.t_threads[f], team));
2184  }
2185  ngo_sync();
2186 #endif // KMP_BARRIER_ICV_PULL
2187 }
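/* Summary of the three ICV propagation strategies selected above at compile
   time:
     - KMP_BARRIER_ICV_PULL: the master publishes the ICVs once into its own
       th_fixed_icvs; each worker pulls its copy in __kmp_fork_barrier().
     - KMP_BARRIER_ICV_PUSH: the ICVs ride along with the fork-barrier
       release (propagate_icvs == TRUE), so nothing is copied here.
     - default: the master copies the ICVs into every worker's implicit task
       in the O(nthreads) loop above (ngo_store_icvs() uses non-globally
       ordered stores on KMP_MIC builds, plain copy_icvs() otherwise). */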