LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46  KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
50 #if OMP_50_ENABLED
51  "5.0 (201611)";
52 #elif OMP_45_ENABLED
53  "4.5 (201511)";
54 #elif OMP_40_ENABLED
55  "4.0 (201307)";
56 #else
57  "3.1 (201107)";
58 #endif
59 
60 #ifdef KMP_DEBUG
61 char const __kmp_version_lock[] =
62  KMP_VERSION_PREFIX "lock type: run time selectable";
63 #endif /* KMP_DEBUG */
64 
65 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
66 
67 /* ------------------------------------------------------------------------ */
68 
69 #if KMP_USE_MONITOR
70 kmp_info_t __kmp_monitor;
71 #endif
72 
73 /* Forward declarations */
74 
75 void __kmp_cleanup(void);
76 
77 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
78  int gtid);
79 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
80  kmp_internal_control_t *new_icvs,
81  ident_t *loc);
82 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
83 static void __kmp_partition_places(kmp_team_t *team,
84  int update_master_only = 0);
85 #endif
86 static void __kmp_do_serial_initialize(void);
87 void __kmp_fork_barrier(int gtid, int tid);
88 void __kmp_join_barrier(int gtid);
89 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
90  kmp_internal_control_t *new_icvs, ident_t *loc);
91 
92 #ifdef USE_LOAD_BALANCE
93 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
94 #endif
95 
96 static int __kmp_expand_threads(int nNeed);
97 #if KMP_OS_WINDOWS
98 static int __kmp_unregister_root_other_thread(int gtid);
99 #endif
100 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
105 /* A fast (and somewhat portable) way to get a unique identifier for the
106  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
107 int __kmp_get_global_thread_id() {
108  int i;
109  kmp_info_t **other_threads;
110  size_t stack_data;
111  char *stack_addr;
112  size_t stack_size;
113  char *stack_base;
114 
115  KA_TRACE(
116  1000,
117  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
118  __kmp_nth, __kmp_all_nth));
119 
120  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
121  a parallel region, we made this return KMP_GTID_DNE to force serial_initialize
122  by the caller. We had to handle KMP_GTID_DNE at all call-sites, or else
123  guarantee __kmp_init_gtid, for this to work. */
124 
125  if (!TCR_4(__kmp_init_gtid))
126  return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129  if (TCR_4(__kmp_gtid_mode) >= 3) {
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131  return __kmp_gtid;
132  }
133 #endif
134  if (TCR_4(__kmp_gtid_mode) >= 2) {
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136  return __kmp_gtid_get_specific();
137  }
138  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140  stack_addr = (char *)&stack_data;
141  other_threads = __kmp_threads;
142 
143  /* ATT: The code below is a source of potential bugs due to unsynchronized
144  access to __kmp_threads array. For example:
145  1. Current thread loads other_threads[i] to thr and checks it, it is
146  non-NULL.
147  2. Current thread is suspended by OS.
148  3. Another thread unregisters and finishes (debug versions of free()
149  may fill memory with something like 0xEF).
150  4. Current thread is resumed.
151  5. Current thread reads junk from *thr.
152  TODO: Fix it. --ln */
153 
154  for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157  if (!thr)
158  continue;
159 
160  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163  /* stack grows down -- search through all of the active threads */
164 
165  if (stack_addr <= stack_base) {
166  size_t stack_diff = stack_base - stack_addr;
167 
168  if (stack_diff <= stack_size) {
169  /* The only way we can be closer than the allocated */
170  /* stack size is if we are running on this thread. */
171  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172  return i;
173  }
174  }
175  }
176 
177  /* use the TLS (get_specific) value to try to determine our gtid */
178  KA_TRACE(1000,
179  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180  "thread, using TLS\n"));
181  i = __kmp_gtid_get_specific();
182 
183  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
184 
185  /* if we haven't been assigned a gtid, then return the (negative) code */
186  if (i < 0)
187  return i;
188 
189  /* dynamically updated stack window for uber threads to avoid get_specific
190  call */
191  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192  KMP_FATAL(StackOverflow, i);
193  }
194 
195  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196  if (stack_addr > stack_base) {
197  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200  stack_base);
201  } else {
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203  stack_base - stack_addr);
204  }
205 
206  /* Reprint stack bounds for ubermaster since they have been refined */
207  if (__kmp_storage_map) {
208  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211  other_threads[i]->th.th_info.ds.ds_stacksize,
212  "th_%d stack (refinement)", i);
213  }
214  return i;
215 }
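// Note added for exposition (not part of the upstream logic): when neither the
// thread-local gtid (mode >= 3) nor keyed TLS (mode >= 2) is available, the
// function above infers the gtid by testing which registered thread's stack
// contains the address of a local variable, roughly:
//
//   char marker;                                  // lives on the caller's stack
//   for each registered thread i:
//     if (stack_base[i] - stack_size[i] <= &marker && &marker <= stack_base[i])
//       return i;                                  // our stack => our gtid
//
// For uber (root) threads the recorded stack window is then widened on demand,
// so later lookups succeed without falling back to TLS.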
216 
217 int __kmp_get_global_thread_id_reg() {
218  int gtid;
219 
220  if (!__kmp_init_serial) {
221  gtid = KMP_GTID_DNE;
222  } else
223 #ifdef KMP_TDATA_GTID
224  if (TCR_4(__kmp_gtid_mode) >= 3) {
225  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226  gtid = __kmp_gtid;
227  } else
228 #endif
229  if (TCR_4(__kmp_gtid_mode) >= 2) {
230  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231  gtid = __kmp_gtid_get_specific();
232  } else {
233  KA_TRACE(1000,
234  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235  gtid = __kmp_get_global_thread_id();
236  }
237 
238  /* we must be a new uber master sibling thread */
239  if (gtid == KMP_GTID_DNE) {
240  KA_TRACE(10,
241  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242  "Registering a new gtid.\n"));
243  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244  if (!__kmp_init_serial) {
245  __kmp_do_serial_initialize();
246  gtid = __kmp_gtid_get_specific();
247  } else {
248  gtid = __kmp_register_root(FALSE);
249  }
250  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252  }
253 
254  KMP_DEBUG_ASSERT(gtid >= 0);
255 
256  return gtid;
257 }
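// Exposition only: the _reg variant adds the registering step. It tries the
// same lookups as above (thread-local gtid, keyed TLS, stack search) and, if
// all of them report KMP_GTID_DNE, takes the bootstrap init lock and either
// runs serial initialization or registers the calling thread as a new root,
// so a valid gtid is always returned.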
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261  int f;
262  char *stack_beg = NULL;
263  char *stack_end = NULL;
264  int gtid;
265 
266  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267  if (__kmp_storage_map) {
268  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271  gtid = __kmp_gtid_from_thread(th);
272 
273  if (gtid == KMP_GTID_MONITOR) {
274  __kmp_print_storage_map_gtid(
275  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276  "th_%s stack (%s)", "mon",
277  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278  } else {
279  __kmp_print_storage_map_gtid(
280  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%d stack (%s)", gtid,
282  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283  }
284  }
285 
286  /* No point in checking ubermaster threads since they use refinement and
287  * cannot overlap */
288  gtid = __kmp_gtid_from_thread(th);
289  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290  KA_TRACE(10,
291  ("__kmp_check_stack_overlap: performing extensive checking\n"));
292  if (stack_beg == NULL) {
293  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295  }
296 
297  for (f = 0; f < __kmp_threads_capacity; f++) {
298  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300  if (f_th && f_th != th) {
301  char *other_stack_end =
302  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303  char *other_stack_beg =
304  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308  /* Print the other stack values before the abort */
309  if (__kmp_storage_map)
310  __kmp_print_storage_map_gtid(
311  -1, other_stack_beg, other_stack_end,
312  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316  __kmp_msg_null);
317  }
318  }
319  }
320  }
321  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
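// Exposition only: the check above is a plain interval-intersection test. With
// this thread's stack spanning [stack_beg, stack_end) and another thread's
// spanning [other_beg, other_end), an overlap is reported when either endpoint
// of our range falls strictly inside the other range, i.e. (sketch):
//
//   bool overlaps = (stack_beg > other_beg && stack_beg < other_end) ||
//                   (stack_end > other_beg && stack_end < other_end);
//
// Uber (root) threads are skipped because their stack bounds are refined in
// __kmp_get_global_thread_id() and therefore cannot overlap.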
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327  static int done = FALSE;
328 
329  while (!done) {
330  KMP_YIELD(TRUE);
331  }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337  char const *format, ...) {
338  char buffer[MAX_MESSAGE];
339  va_list ap;
340 
341  va_start(ap, format);
342  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343  p2, (unsigned long)size, format);
344  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345  __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347  int node;
348  if (gtid >= 0) {
349  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350  if (__kmp_storage_map_verbose) {
351  node = __kmp_get_host_node(p1);
352  if (node < 0) /* doesn't work, so don't try this next time */
353  __kmp_storage_map_verbose = FALSE;
354  else {
355  char *last;
356  int lastNode;
357  int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359  const int page_size = KMP_GET_PAGE_SIZE();
360 
361  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363  if (localProc >= 0)
364  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
365  localProc >> 1);
366  else
367  __kmp_printf_no_lock(" GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369  /* The more elaborate format is disabled for now because of the prctl
370  * hanging bug. */
371  do {
372  last = p1;
373  lastNode = node;
374  /* This loop collates adjacent pages with the same host node. */
375  do {
376  (char *)p1 += page_size;
377  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
379  lastNode);
380  } while (p1 <= p2);
381 #else
382  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
383  (char *)p1 + (page_size - 1),
384  __kmp_get_host_node(p1));
385  if (p1 < p2) {
386  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
387  (char *)p2 + (page_size - 1),
388  __kmp_get_host_node(p2));
389  }
390 #endif
391  }
392  }
393  } else
394  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
395  }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401  char buffer[MAX_MESSAGE];
402  va_list ap;
403 
404  if (__kmp_generate_warnings == kmp_warnings_off) {
405  return;
406  }
407 
408  va_start(ap, format);
409 
410  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412  __kmp_vprintf(kmp_err, buffer, ap);
413  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415  va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419  // Later threads may stall here, but that's ok because abort() will kill them.
420  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422  if (__kmp_debug_buf) {
423  __kmp_dump_debug_buffer();
424  }
425 
426  if (KMP_OS_WINDOWS) {
427  // Let other threads know of abnormal termination and prevent deadlock
428  // if abort happened during library initialization or shutdown
429  __kmp_global.g.g_abort = SIGABRT;
430 
431  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
432  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
433  boxes. _set_abort_behavior() works well, but this function is not
434  available in VS7 (this is not a problem for the DLL, but it is a problem
435  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
436  does not help, at least in some versions of MS C RTL.
437 
438  It seems the following sequence is the only way to simulate abort() and
439  avoid the pop-up error box. */
440  raise(SIGABRT);
441  _exit(3); // Just in case, if signal ignored, exit anyway.
442  } else {
443  abort();
444  }
445 
446  __kmp_infinite_loop();
447  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
448 
449 } // __kmp_abort_process
450 
451 void __kmp_abort_thread(void) {
452  // TODO: Eliminate g_abort global variable and this function.
453  // In case of abort just call abort(), it will kill all the threads.
454  __kmp_infinite_loop();
455 } // __kmp_abort_thread
456 
457 /* Print out the storage map for the major kmp_info_t thread data structures
458  that are allocated together. */
459 
460 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
461  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
462  gtid);
463 
464  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
465  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
466 
467  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
468  sizeof(kmp_local_t), "th_%d.th_local", gtid);
469 
470  __kmp_print_storage_map_gtid(
471  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
472  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
473 
474  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
475  &thr->th.th_bar[bs_plain_barrier + 1],
476  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
477  gtid);
478 
479  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480  &thr->th.th_bar[bs_forkjoin_barrier + 1],
481  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
482  gtid);
483 
484 #if KMP_FAST_REDUCTION_BARRIER
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
486  &thr->th.th_bar[bs_reduction_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
488  gtid);
489 #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /* Print out the storage map for the major kmp_team_t team data structures
493  that are allocated together. */
494 
495 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
496  int team_id, int num_thr) {
497  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
498  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
499  header, team_id);
500 
501  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
502  &team->t.t_bar[bs_last_barrier],
503  sizeof(kmp_balign_team_t) * bs_last_barrier,
504  "%s_%d.t_bar", header, team_id);
505 
506  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
507  &team->t.t_bar[bs_plain_barrier + 1],
508  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
509  header, team_id);
510 
511  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
512  &team->t.t_bar[bs_forkjoin_barrier + 1],
513  sizeof(kmp_balign_team_t),
514  "%s_%d.t_bar[forkjoin]", header, team_id);
515 
516 #if KMP_FAST_REDUCTION_BARRIER
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
518  &team->t.t_bar[bs_reduction_barrier + 1],
519  sizeof(kmp_balign_team_t),
520  "%s_%d.t_bar[reduction]", header, team_id);
521 #endif // KMP_FAST_REDUCTION_BARRIER
522 
523  __kmp_print_storage_map_gtid(
524  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
525  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
526 
527  __kmp_print_storage_map_gtid(
528  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
529  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
530 
531  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
532  &team->t.t_disp_buffer[num_disp_buff],
533  sizeof(dispatch_shared_info_t) * num_disp_buff,
534  "%s_%d.t_disp_buffer", header, team_id);
535 }
536 
537 static void __kmp_init_allocator() {
538 #if OMP_50_ENABLED
539  __kmp_init_memkind();
540 #endif
541 }
542 static void __kmp_fini_allocator() {
543 #if OMP_50_ENABLED
544  __kmp_fini_memkind();
545 #endif
546 }
547 
548 /* ------------------------------------------------------------------------ */
549 
550 #if KMP_DYNAMIC_LIB
551 #if KMP_OS_WINDOWS
552 
553 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
554  // TODO: Change to __kmp_break_bootstrap_lock().
555  __kmp_init_bootstrap_lock(lck); // make the lock released
556 }
557 
558 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
559  int i;
560  int thread_count;
561 
562  // PROCESS_DETACH is expected to be called by a thread that executes
563  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
564  // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
565  // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
566  // threads can still be alive here, although they are about to be terminated.
567  // The threads in the array with ds_thread==0 are the most suspicious, so
568  // accessing __kmp_threads[] may not actually be safe.
569 
570  // TODO: does it make sense to check __kmp_roots[] ?
571 
572  // Let's check that there are no other alive threads registered with the OMP
573  // lib.
574  while (1) {
575  thread_count = 0;
576  for (i = 0; i < __kmp_threads_capacity; ++i) {
577  if (!__kmp_threads)
578  continue;
579  kmp_info_t *th = __kmp_threads[i];
580  if (th == NULL)
581  continue;
582  int gtid = th->th.th_info.ds.ds_gtid;
583  if (gtid == gtid_req)
584  continue;
585  if (gtid < 0)
586  continue;
587  DWORD exit_val;
588  int alive = __kmp_is_thread_alive(th, &exit_val);
589  if (alive) {
590  ++thread_count;
591  }
592  }
593  if (thread_count == 0)
594  break; // success
595  }
596 
597  // Assume that I'm alone. Now it might be safe to check and reset locks.
598  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
599  __kmp_reset_lock(&__kmp_forkjoin_lock);
600 #ifdef KMP_DEBUG
601  __kmp_reset_lock(&__kmp_stdio_lock);
602 #endif // KMP_DEBUG
603 }
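// Exposition only: the loop above spins until no other OMP-registered thread
// is still alive, then re-initializes the forkjoin bootstrap lock (and, in
// debug builds, the stdio lock) so the detaching thread cannot deadlock on a
// lock left held by a worker the OS has already killed.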
604 
605 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
606  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
607 
608  switch (fdwReason) {
609 
610  case DLL_PROCESS_ATTACH:
611  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
612 
613  return TRUE;
614 
615  case DLL_PROCESS_DETACH:
616  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
617 
618  if (lpReserved != NULL) {
619  // lpReserved is used for telling the difference:
620  // lpReserved == NULL when FreeLibrary() was called,
621  // lpReserved != NULL when the process terminates.
622  // When FreeLibrary() is called, worker threads remain alive. So they will
623  // release the forkjoin lock by themselves. When the process terminates,
624  // worker threads disappear triggering the problem of unreleased forkjoin
625  // lock as described below.
626 
627  // A worker thread can take the forkjoin lock. The problem comes up if
628  // that worker thread becomes dead before it releases the forkjoin lock.
629  // The forkjoin lock remains taken, while the thread executing
630  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
631  // to take the forkjoin lock and will always fail, so that the application
632  // will never finish [normally]. This scenario is possible if
633  // __kmpc_end() has not been executed. It looks like it's not a corner
634  // case, but common cases:
635  // - the main function was compiled by an alternative compiler;
636  // - the main function was compiled by icl but without /Qopenmp
637  // (application with plugins);
638  // - application terminates by calling C exit(), Fortran CALL EXIT() or
639  // Fortran STOP.
640  // - alive foreign thread prevented __kmpc_end from doing cleanup.
641  //
642  // This is a hack to work around the problem.
643  // TODO: !!! figure out something better.
644  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
645  }
646 
647  __kmp_internal_end_library(__kmp_gtid_get_specific());
648 
649  return TRUE;
650 
651  case DLL_THREAD_ATTACH:
652  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
653 
654  /* if we want to register new siblings all the time here call
655  * __kmp_get_gtid(); */
656  return TRUE;
657 
658  case DLL_THREAD_DETACH:
659  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
660 
661  __kmp_internal_end_thread(__kmp_gtid_get_specific());
662  return TRUE;
663  }
664 
665  return TRUE;
666 }
667 
668 #endif /* KMP_OS_WINDOWS */
669 #endif /* KMP_DYNAMIC_LIB */
670 
671 /* __kmp_parallel_deo -- Wait until it's our turn. */
672 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
673  int gtid = *gtid_ref;
674 #ifdef BUILD_PARALLEL_ORDERED
675  kmp_team_t *team = __kmp_team_from_gtid(gtid);
676 #endif /* BUILD_PARALLEL_ORDERED */
677 
678  if (__kmp_env_consistency_check) {
679  if (__kmp_threads[gtid]->th.th_root->r.r_active)
680 #if KMP_USE_DYNAMIC_LOCK
681  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
682 #else
683  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
684 #endif
685  }
686 #ifdef BUILD_PARALLEL_ORDERED
687  if (!team->t.t_serialized) {
688  KMP_MB();
689  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
690  NULL);
691  KMP_MB();
692  }
693 #endif /* BUILD_PARALLEL_ORDERED */
694 }
695 
696 /* __kmp_parallel_dxo -- Signal the next task. */
697 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
698  int gtid = *gtid_ref;
699 #ifdef BUILD_PARALLEL_ORDERED
700  int tid = __kmp_tid_from_gtid(gtid);
701  kmp_team_t *team = __kmp_team_from_gtid(gtid);
702 #endif /* BUILD_PARALLEL_ORDERED */
703 
704  if (__kmp_env_consistency_check) {
705  if (__kmp_threads[gtid]->th.th_root->r.r_active)
706  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
707  }
708 #ifdef BUILD_PARALLEL_ORDERED
709  if (!team->t.t_serialized) {
710  KMP_MB(); /* Flush all pending memory write invalidates. */
711 
712  /* use the tid of the next thread in this team */
713  /* TODO replace with general release procedure */
714  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
715 
716  KMP_MB(); /* Flush all pending memory write invalidates. */
717  }
718 #endif /* BUILD_PARALLEL_ORDERED */
719 }
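// Exposition only: together these two routines implement the token-passing
// protocol behind an ordered region. __kmp_parallel_deo() spin-waits until
// team->t.t_ordered.dt.t_value equals the caller's tid, and __kmp_parallel_dxo()
// then advances the value to (tid + 1) % nproc, releasing the next thread in
// logical iteration order. A chunk of an ordered loop therefore executes
// roughly as:
//
//   deo(...);          // wait until it is my turn
//   /* body of the ordered region */
//   dxo(...);          // hand the token to the next thread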
720 
721 /* ------------------------------------------------------------------------ */
722 /* The BARRIER for a SINGLE process section is always explicit */
723 
724 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
725  int status;
726  kmp_info_t *th;
727  kmp_team_t *team;
728 
729  if (!TCR_4(__kmp_init_parallel))
730  __kmp_parallel_initialize();
731 
732 #if OMP_50_ENABLED
733  __kmp_resume_if_soft_paused();
734 #endif
735 
736  th = __kmp_threads[gtid];
737  team = th->th.th_team;
738  status = 0;
739 
740  th->th.th_ident = id_ref;
741 
742  if (team->t.t_serialized) {
743  status = 1;
744  } else {
745  kmp_int32 old_this = th->th.th_local.this_construct;
746 
747  ++th->th.th_local.this_construct;
748  /* try to set team count to thread count--success means thread got the
749  single block */
750  /* TODO: Should this be acquire or release? */
751  if (team->t.t_construct == old_this) {
752  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
753  th->th.th_local.this_construct);
754  }
755 #if USE_ITT_BUILD
756  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
757  KMP_MASTER_GTID(gtid) &&
758 #if OMP_40_ENABLED
759  th->th.th_teams_microtask == NULL &&
760 #endif
761  team->t.t_active_level ==
762  1) { // Only report metadata by master of active team at level 1
763  __kmp_itt_metadata_single(id_ref);
764  }
765 #endif /* USE_ITT_BUILD */
766  }
767 
768  if (__kmp_env_consistency_check) {
769  if (status && push_ws) {
770  __kmp_push_workshare(gtid, ct_psingle, id_ref);
771  } else {
772  __kmp_check_workshare(gtid, ct_psingle, id_ref);
773  }
774  }
775 #if USE_ITT_BUILD
776  if (status) {
777  __kmp_itt_single_start(gtid);
778  }
779 #endif /* USE_ITT_BUILD */
780  return status;
781 }
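// Exposition only: the winner of a single construct is decided by the atomic
// compare-and-store on team->t.t_construct above. Every thread bumps its
// private this_construct counter and tries to install the new value; exactly
// one CAS succeeds per construct, and that thread returns status == 1 and
// executes the single block, e.g. (sketch of the intended lowering):
//
//   if (__kmp_enter_single(gtid, loc, TRUE)) {
//     /* body of #pragma omp single */
//     __kmp_exit_single(gtid);
//   }
//   /* barrier unless nowait */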
782 
783 void __kmp_exit_single(int gtid) {
784 #if USE_ITT_BUILD
785  __kmp_itt_single_end(gtid);
786 #endif /* USE_ITT_BUILD */
787  if (__kmp_env_consistency_check)
788  __kmp_pop_workshare(gtid, ct_psingle, NULL);
789 }
790 
791 /* determine if we can go parallel or must use a serialized parallel region and
792  * how many threads we can use
793  * set_nproc is the number of threads requested for the team
794  * returns 0 if we should serialize or only use one thread,
795  * otherwise the number of threads to use
796  * The forkjoin lock is held by the caller. */
797 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
798  int master_tid, int set_nthreads
799 #if OMP_40_ENABLED
800  ,
801  int enter_teams
802 #endif /* OMP_40_ENABLED */
803  ) {
804  int capacity;
805  int new_nthreads;
806  KMP_DEBUG_ASSERT(__kmp_init_serial);
807  KMP_DEBUG_ASSERT(root && parent_team);
808  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
809 
810  // If dyn-var is set, dynamically adjust the number of desired threads,
811  // according to the method specified by dynamic_mode.
812  new_nthreads = set_nthreads;
813  if (!get__dynamic_2(parent_team, master_tid)) {
814  ;
815  }
816 #ifdef USE_LOAD_BALANCE
817  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
818  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
819  if (new_nthreads == 1) {
820  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
821  "reservation to 1 thread\n",
822  master_tid));
823  return 1;
824  }
825  if (new_nthreads < set_nthreads) {
826  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
827  "reservation to %d threads\n",
828  master_tid, new_nthreads));
829  }
830  }
831 #endif /* USE_LOAD_BALANCE */
832  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
833  new_nthreads = __kmp_avail_proc - __kmp_nth +
834  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
835  if (new_nthreads <= 1) {
836  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
837  "reservation to 1 thread\n",
838  master_tid));
839  return 1;
840  }
841  if (new_nthreads < set_nthreads) {
842  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
843  "reservation to %d threads\n",
844  master_tid, new_nthreads));
845  } else {
846  new_nthreads = set_nthreads;
847  }
848  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
849  if (set_nthreads > 2) {
850  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
851  new_nthreads = (new_nthreads % set_nthreads) + 1;
852  if (new_nthreads == 1) {
853  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
854  "reservation to 1 thread\n",
855  master_tid));
856  return 1;
857  }
858  if (new_nthreads < set_nthreads) {
859  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
860  "reservation to %d threads\n",
861  master_tid, new_nthreads));
862  }
863  }
864  } else {
865  KMP_ASSERT(0);
866  }
867 
868  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
869  if (__kmp_nth + new_nthreads -
870  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
871  __kmp_max_nth) {
872  int tl_nthreads = __kmp_max_nth - __kmp_nth +
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
874  if (tl_nthreads <= 0) {
875  tl_nthreads = 1;
876  }
877 
878  // If dyn-var is false, emit a 1-time warning.
879  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
880  __kmp_reserve_warn = 1;
881  __kmp_msg(kmp_ms_warning,
882  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
883  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
884  }
885  if (tl_nthreads == 1) {
886  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
887  "reduced reservation to 1 thread\n",
888  master_tid));
889  return 1;
890  }
891  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
892  "reservation to %d threads\n",
893  master_tid, tl_nthreads));
894  new_nthreads = tl_nthreads;
895  }
896 
897  // Respect OMP_THREAD_LIMIT
898  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
899  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
900  if (cg_nthreads + new_nthreads -
901  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
902  max_cg_threads) {
903  int tl_nthreads = max_cg_threads - cg_nthreads +
904  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
905  if (tl_nthreads <= 0) {
906  tl_nthreads = 1;
907  }
908 
909  // If dyn-var is false, emit a 1-time warning.
910  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
911  __kmp_reserve_warn = 1;
912  __kmp_msg(kmp_ms_warning,
913  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
914  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
915  }
916  if (tl_nthreads == 1) {
917  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
918  "reduced reservation to 1 thread\n",
919  master_tid));
920  return 1;
921  }
922  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
923  "reservation to %d threads\n",
924  master_tid, tl_nthreads));
925  new_nthreads = tl_nthreads;
926  }
927 
928  // Check if the threads array is large enough, or needs expanding.
929  // See comment in __kmp_register_root() about the adjustment if
930  // __kmp_threads[0] == NULL.
931  capacity = __kmp_threads_capacity;
932  if (TCR_PTR(__kmp_threads[0]) == NULL) {
933  --capacity;
934  }
935  if (__kmp_nth + new_nthreads -
936  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
937  capacity) {
938  // Expand the threads array.
939  int slotsRequired = __kmp_nth + new_nthreads -
940  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
941  capacity;
942  int slotsAdded = __kmp_expand_threads(slotsRequired);
943  if (slotsAdded < slotsRequired) {
944  // The threads array was not expanded enough.
945  new_nthreads -= (slotsRequired - slotsAdded);
946  KMP_ASSERT(new_nthreads >= 1);
947 
948  // If dyn-var is false, emit a 1-time warning.
949  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
950  __kmp_reserve_warn = 1;
951  if (__kmp_tp_cached) {
952  __kmp_msg(kmp_ms_warning,
953  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
954  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
955  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
956  } else {
957  __kmp_msg(kmp_ms_warning,
958  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
959  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
960  }
961  }
962  }
963  }
964 
965 #ifdef KMP_DEBUG
966  if (new_nthreads == 1) {
967  KC_TRACE(10,
968  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
969  "dead roots and rechecking; requested %d threads\n",
970  __kmp_get_gtid(), set_nthreads));
971  } else {
972  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
973  " %d threads\n",
974  __kmp_get_gtid(), new_nthreads, set_nthreads));
975  }
976 #endif // KMP_DEBUG
977  return new_nthreads;
978 }
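// Exposition only: the reservation above is a cascade of clamps applied to the
// requested thread count, in order:
//   1. dyn-var adjustment (load balance, thread limit, or random mode),
//   2. KMP_DEVICE_THREAD_LIMIT / KMP_ALL_THREADS (__kmp_max_nth),
//   3. OMP_THREAD_LIMIT for the contention group (cg_thread_limit),
//   4. the capacity of __kmp_threads[], expanded on demand.
// A result of 1 tells the caller to use a single thread, i.e. to serialize the
// parallel region.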
979 
980 /* Allocate threads from the thread pool and assign them to the new team. We are
981  assured that there are enough threads available, because we checked on that
982  earlier while holding the forkjoin lock. */
983 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
984  kmp_info_t *master_th, int master_gtid) {
985  int i;
986  int use_hot_team;
987 
988  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
989  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
990  KMP_MB();
991 
992  /* first, let's setup the master thread */
993  master_th->th.th_info.ds.ds_tid = 0;
994  master_th->th.th_team = team;
995  master_th->th.th_team_nproc = team->t.t_nproc;
996  master_th->th.th_team_master = master_th;
997  master_th->th.th_team_serialized = FALSE;
998  master_th->th.th_dispatch = &team->t.t_dispatch[0];
999 
1000 /* make sure we are not the optimized hot team */
1001 #if KMP_NESTED_HOT_TEAMS
1002  use_hot_team = 0;
1003  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1004  if (hot_teams) { // hot teams array is not allocated if
1005  // KMP_HOT_TEAMS_MAX_LEVEL=0
1006  int level = team->t.t_active_level - 1; // index in array of hot teams
1007  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1008  if (master_th->th.th_teams_size.nteams > 1) {
1009  ++level; // level was not increased in teams construct for
1010  // team_of_masters
1011  }
1012  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1013  master_th->th.th_teams_level == team->t.t_level) {
1014  ++level; // level was not increased in teams construct for
1015  // team_of_workers before the parallel
1016  } // team->t.t_level will be increased inside parallel
1017  }
1018  if (level < __kmp_hot_teams_max_level) {
1019  if (hot_teams[level].hot_team) {
1020  // hot team has already been allocated for given level
1021  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1022  use_hot_team = 1; // the team is ready to use
1023  } else {
1024  use_hot_team = 0; // AC: threads are not allocated yet
1025  hot_teams[level].hot_team = team; // remember new hot team
1026  hot_teams[level].hot_team_nth = team->t.t_nproc;
1027  }
1028  } else {
1029  use_hot_team = 0;
1030  }
1031  }
1032 #else
1033  use_hot_team = team == root->r.r_hot_team;
1034 #endif
1035  if (!use_hot_team) {
1036 
1037  /* install the master thread */
1038  team->t.t_threads[0] = master_th;
1039  __kmp_initialize_info(master_th, team, 0, master_gtid);
1040 
1041  /* now, install the worker threads */
1042  for (i = 1; i < team->t.t_nproc; i++) {
1043 
1044  /* fork or reallocate a new thread and install it in team */
1045  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1046  team->t.t_threads[i] = thr;
1047  KMP_DEBUG_ASSERT(thr);
1048  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1049  /* align team and thread arrived states */
1050  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1051  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1052  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1053  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1054  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1055  team->t.t_bar[bs_plain_barrier].b_arrived));
1056 #if OMP_40_ENABLED
1057  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1058  thr->th.th_teams_level = master_th->th.th_teams_level;
1059  thr->th.th_teams_size = master_th->th.th_teams_size;
1060 #endif
1061  { // Initialize threads' barrier data.
1062  int b;
1063  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1064  for (b = 0; b < bs_last_barrier; ++b) {
1065  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1066  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1067 #if USE_DEBUGGER
1068  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1069 #endif
1070  }
1071  }
1072  }
1073 
1074 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1075  __kmp_partition_places(team);
1076 #endif
1077  }
1078 
1079 #if OMP_50_ENABLED
1080  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1081  for (i = 0; i < team->t.t_nproc; i++) {
1082  kmp_info_t *thr = team->t.t_threads[i];
1083  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1084  thr->th.th_prev_level != team->t.t_level) {
1085  team->t.t_display_affinity = 1;
1086  break;
1087  }
1088  }
1089  }
1090 #endif
1091 
1092  KMP_MB();
1093 }
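// Exposition only: for a non-hot team the loop above installs the master in
// slot 0, then allocates (or recycles from the thread pool) one worker per
// remaining slot and copies the team's barrier b_arrived counters into each
// worker so the first fork/join barrier lines up. Hot teams skip all of this
// because their workers stay parked between parallel regions.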
1094 
1095 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1096 // Propagate any changes to the floating point control registers out to the team.
1097 // We try to avoid unnecessary writes to the relevant cache line in the team
1098 // structure, so we don't make changes unless they are needed.
1099 inline static void propagateFPControl(kmp_team_t *team) {
1100  if (__kmp_inherit_fp_control) {
1101  kmp_int16 x87_fpu_control_word;
1102  kmp_uint32 mxcsr;
1103 
1104  // Get master values of FPU control flags (both X87 and vector)
1105  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1106  __kmp_store_mxcsr(&mxcsr);
1107  mxcsr &= KMP_X86_MXCSR_MASK;
1108 
1109  // There is no point looking at t_fp_control_saved here.
1110  // If it is TRUE, we still have to update the values if they are different
1111  // from those we now have. If it is FALSE we didn't save anything yet, but
1112  // our objective is the same. We have to ensure that the values in the team
1113  // are the same as those we have.
1114  // So, this code achieves what we need whether or not t_fp_control_saved is
1115  // true. By checking whether the value needs updating we avoid unnecessary
1116  // writes that would put the cache-line into a written state, causing all
1117  // threads in the team to have to read it again.
1118  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1119  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1120  // Although we don't use this value, other code in the runtime wants to know
1121  // whether it should restore them. So we must ensure it is correct.
1122  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1123  } else {
1124  // Similarly here. Don't write to this cache-line in the team structure
1125  // unless we have to.
1126  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1127  }
1128 }
1129 
1130 // Do the opposite, setting the hardware registers to the updated values from
1131 // the team.
1132 inline static void updateHWFPControl(kmp_team_t *team) {
1133  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1134  // Only reset the fp control regs if they have been changed in the team
1135  // during the parallel region that we are exiting.
1136  kmp_int16 x87_fpu_control_word;
1137  kmp_uint32 mxcsr;
1138  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1139  __kmp_store_mxcsr(&mxcsr);
1140  mxcsr &= KMP_X86_MXCSR_MASK;
1141 
1142  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1143  __kmp_clear_x87_fpu_status_word();
1144  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1145  }
1146 
1147  if (team->t.t_mxcsr != mxcsr) {
1148  __kmp_load_mxcsr(&team->t.t_mxcsr);
1149  }
1150  }
1151 }
1152 #else
1153 #define propagateFPControl(x) ((void)0)
1154 #define updateHWFPControl(x) ((void)0)
1155 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
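// Exposition only: propagateFPControl() snapshots the master's x87 control
// word and MXCSR into the team (writing the cache line only when the values
// actually differ, to avoid needless invalidations), and updateHWFPControl()
// loads those saved values back into the hardware registers when the current
// contents differ, so a thread leaves the parallel region with the FP
// environment the master had on entry.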
1156 
1157 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1158  int realloc); // forward declaration
1159 
1160 /* Run a parallel region that has been serialized, so it runs only in a team of
1161  the single master thread. */
1162 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1163  kmp_info_t *this_thr;
1164  kmp_team_t *serial_team;
1165 
1166  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1167 
1168  /* Skip all this code for autopar serialized loops since it results in
1169  unacceptable overhead */
1170  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1171  return;
1172 
1173  if (!TCR_4(__kmp_init_parallel))
1174  __kmp_parallel_initialize();
1175 
1176 #if OMP_50_ENABLED
1177  __kmp_resume_if_soft_paused();
1178 #endif
1179 
1180  this_thr = __kmp_threads[global_tid];
1181  serial_team = this_thr->th.th_serial_team;
1182 
1183  /* utilize the serialized team held by this thread */
1184  KMP_DEBUG_ASSERT(serial_team);
1185  KMP_MB();
1186 
1187  if (__kmp_tasking_mode != tskm_immediate_exec) {
1188  KMP_DEBUG_ASSERT(
1189  this_thr->th.th_task_team ==
1190  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1191  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1192  NULL);
1193  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1194  "team %p, new task_team = NULL\n",
1195  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1196  this_thr->th.th_task_team = NULL;
1197  }
1198 
1199 #if OMP_40_ENABLED
1200  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1201  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1202  proc_bind = proc_bind_false;
1203  } else if (proc_bind == proc_bind_default) {
1204  // No proc_bind clause was specified, so use the current value
1205  // of proc-bind-var for this parallel region.
1206  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1207  }
1208  // Reset for next parallel region
1209  this_thr->th.th_set_proc_bind = proc_bind_default;
1210 #endif /* OMP_40_ENABLED */
1211 
1212 #if OMPT_SUPPORT
1213  ompt_data_t ompt_parallel_data = ompt_data_none;
1214  ompt_data_t *implicit_task_data;
1215  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1216  if (ompt_enabled.enabled &&
1217  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1218 
1219  ompt_task_info_t *parent_task_info;
1220  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1221 
1222  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1223  if (ompt_enabled.ompt_callback_parallel_begin) {
1224  int team_size = 1;
1225 
1226  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1227  &(parent_task_info->task_data), &(parent_task_info->frame),
1228  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1229  codeptr);
1230  }
1231  }
1232 #endif // OMPT_SUPPORT
1233 
1234  if (this_thr->th.th_team != serial_team) {
1235  // Nested level will be an index in the nested nthreads array
1236  int level = this_thr->th.th_team->t.t_level;
1237 
1238  if (serial_team->t.t_serialized) {
1239  /* this serial team was already used
1240  TODO: increase performance by making these locks more specific */
1241  kmp_team_t *new_team;
1242 
1243  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1244 
1245  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1246 #if OMPT_SUPPORT
1247  ompt_parallel_data,
1248 #endif
1249 #if OMP_40_ENABLED
1250  proc_bind,
1251 #endif
1252  &this_thr->th.th_current_task->td_icvs,
1253  0 USE_NESTED_HOT_ARG(NULL));
1254  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1255  KMP_ASSERT(new_team);
1256 
1257  /* setup new serialized team and install it */
1258  new_team->t.t_threads[0] = this_thr;
1259  new_team->t.t_parent = this_thr->th.th_team;
1260  serial_team = new_team;
1261  this_thr->th.th_serial_team = serial_team;
1262 
1263  KF_TRACE(
1264  10,
1265  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1266  global_tid, serial_team));
1267 
1268  /* TODO the above breaks the requirement that if we run out of resources,
1269  then we can still guarantee that serialized teams are ok, since we may
1270  need to allocate a new one */
1271  } else {
1272  KF_TRACE(
1273  10,
1274  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1275  global_tid, serial_team));
1276  }
1277 
1278  /* we have to initialize this serial team */
1279  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1280  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1281  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1282  serial_team->t.t_ident = loc;
1283  serial_team->t.t_serialized = 1;
1284  serial_team->t.t_nproc = 1;
1285  serial_team->t.t_parent = this_thr->th.th_team;
1286  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1287  this_thr->th.th_team = serial_team;
1288  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1289 
1290  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1291  this_thr->th.th_current_task));
1292  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1293  this_thr->th.th_current_task->td_flags.executing = 0;
1294 
1295  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1296 
1297  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1298  implicit task for each serialized task represented by
1299  team->t.t_serialized? */
1300  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1301  &this_thr->th.th_current_task->td_parent->td_icvs);
1302 
1303  // Thread value exists in the nested nthreads array for the next nested
1304  // level
1305  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1306  this_thr->th.th_current_task->td_icvs.nproc =
1307  __kmp_nested_nth.nth[level + 1];
1308  }
1309 
1310 #if OMP_40_ENABLED
1311  if (__kmp_nested_proc_bind.used &&
1312  (level + 1 < __kmp_nested_proc_bind.used)) {
1313  this_thr->th.th_current_task->td_icvs.proc_bind =
1314  __kmp_nested_proc_bind.bind_types[level + 1];
1315  }
1316 #endif /* OMP_40_ENABLED */
1317 
1318 #if USE_DEBUGGER
1319  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1320 #endif
1321  this_thr->th.th_info.ds.ds_tid = 0;
1322 
1323  /* set thread cache values */
1324  this_thr->th.th_team_nproc = 1;
1325  this_thr->th.th_team_master = this_thr;
1326  this_thr->th.th_team_serialized = 1;
1327 
1328  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1329  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1330 #if OMP_50_ENABLED
1331  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1332 #endif
1333 
1334  propagateFPControl(serial_team);
1335 
1336  /* check if we need to allocate dispatch buffers stack */
1337  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1338  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1339  serial_team->t.t_dispatch->th_disp_buffer =
1340  (dispatch_private_info_t *)__kmp_allocate(
1341  sizeof(dispatch_private_info_t));
1342  }
1343  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1344 
1345  KMP_MB();
1346 
1347  } else {
1348  /* this serialized team is already being used,
1349  * that's fine, just add another nested level */
1350  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1351  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1352  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1353  ++serial_team->t.t_serialized;
1354  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1355 
1356  // Nested level will be an index in the nested nthreads array
1357  int level = this_thr->th.th_team->t.t_level;
1358  // Thread value exists in the nested nthreads array for the next nested
1359  // level
1360  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1361  this_thr->th.th_current_task->td_icvs.nproc =
1362  __kmp_nested_nth.nth[level + 1];
1363  }
1364  serial_team->t.t_level++;
1365  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1366  "of serial team %p to %d\n",
1367  global_tid, serial_team, serial_team->t.t_level));
1368 
1369  /* allocate/push dispatch buffers stack */
1370  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1371  {
1372  dispatch_private_info_t *disp_buffer =
1373  (dispatch_private_info_t *)__kmp_allocate(
1374  sizeof(dispatch_private_info_t));
1375  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1376  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1377  }
1378  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1379 
1380  KMP_MB();
1381  }
1382 #if OMP_40_ENABLED
1383  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1384 #endif
1385 
1386 #if OMP_50_ENABLED
1387  // Perform the display affinity functionality for
1388  // serialized parallel regions
1389  if (__kmp_display_affinity) {
1390  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1391  this_thr->th.th_prev_num_threads != 1) {
1392  // NULL means use the affinity-format-var ICV
1393  __kmp_aux_display_affinity(global_tid, NULL);
1394  this_thr->th.th_prev_level = serial_team->t.t_level;
1395  this_thr->th.th_prev_num_threads = 1;
1396  }
1397  }
1398 #endif
1399 
1400  if (__kmp_env_consistency_check)
1401  __kmp_push_parallel(global_tid, NULL);
1402 #if OMPT_SUPPORT
1403  serial_team->t.ompt_team_info.master_return_address = codeptr;
1404  if (ompt_enabled.enabled &&
1405  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1406  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1407 
1408  ompt_lw_taskteam_t lw_taskteam;
1409  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1410  &ompt_parallel_data, codeptr);
1411 
1412  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1413  // don't use lw_taskteam after linking. Its content was swapped.
1414 
1415  /* OMPT implicit task begin */
1416  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1417  if (ompt_enabled.ompt_callback_implicit_task) {
1418  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1419  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1420  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1421  OMPT_CUR_TASK_INFO(this_thr)
1422  ->thread_num = __kmp_tid_from_gtid(global_tid);
1423  }
1424 
1425  /* OMPT state */
1426  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1427  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1428  }
1429 #endif
1430 }
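// Exposition only: a serialized parallel region does not fork. The routine
// above installs (or allocates) the thread's cached serial team, bumps
// t_serialized once per nesting level, pushes one dispatch buffer per level,
// and adjusts levels/ICVs so omp_get_num_threads() and related queries report
// a one-thread team. __kmp_fork_call() falls back to this path whenever it
// decides not to go parallel.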
1431 
1432 /* most of the work for a fork */
1433 /* return true if we really went parallel, false if serialized */
1434 int __kmp_fork_call(ident_t *loc, int gtid,
1435  enum fork_context_e call_context, // Intel, GNU, ...
1436  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1437 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1438 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1439  va_list *ap
1440 #else
1441  va_list ap
1442 #endif
1443  ) {
1444  void **argv;
1445  int i;
1446  int master_tid;
1447  int master_this_cons;
1448  kmp_team_t *team;
1449  kmp_team_t *parent_team;
1450  kmp_info_t *master_th;
1451  kmp_root_t *root;
1452  int nthreads;
1453  int master_active;
1454  int master_set_numthreads;
1455  int level;
1456 #if OMP_40_ENABLED
1457  int active_level;
1458  int teams_level;
1459 #endif
1460 #if KMP_NESTED_HOT_TEAMS
1461  kmp_hot_team_ptr_t **p_hot_teams;
1462 #endif
1463  { // KMP_TIME_BLOCK
1464  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1465  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1466 
1467  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1468  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1469  /* Some systems prefer the stack for the root thread(s) to start with */
1470  /* some gap from the parent stack to prevent false sharing. */
1471  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1472  /* These 2 lines below are so this does not get optimized out */
1473  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1474  __kmp_stkpadding += (short)((kmp_int64)dummy);
1475  }
1476 
1477  /* initialize if needed */
1478  KMP_DEBUG_ASSERT(
1479  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1480  if (!TCR_4(__kmp_init_parallel))
1481  __kmp_parallel_initialize();
1482 
1483 #if OMP_50_ENABLED
1484  __kmp_resume_if_soft_paused();
1485 #endif
1486 
1487  /* setup current data */
1488  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1489  // shutdown
1490  parent_team = master_th->th.th_team;
1491  master_tid = master_th->th.th_info.ds.ds_tid;
1492  master_this_cons = master_th->th.th_local.this_construct;
1493  root = master_th->th.th_root;
1494  master_active = root->r.r_active;
1495  master_set_numthreads = master_th->th.th_set_nproc;
1496 
1497 #if OMPT_SUPPORT
1498  ompt_data_t ompt_parallel_data = ompt_data_none;
1499  ompt_data_t *parent_task_data;
1500  ompt_frame_t *ompt_frame;
1501  ompt_data_t *implicit_task_data;
1502  void *return_address = NULL;
1503 
1504  if (ompt_enabled.enabled) {
1505  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1506  NULL, NULL);
1507  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1508  }
1509 #endif
1510 
1511  // Nested level will be an index in the nested nthreads array
1512  level = parent_team->t.t_level;
1513  // used to launch non-serial teams even if nested is not allowed
1514  active_level = parent_team->t.t_active_level;
1515 #if OMP_40_ENABLED
1516  // needed to check nesting inside the teams
1517  teams_level = master_th->th.th_teams_level;
1518 #endif
1519 #if KMP_NESTED_HOT_TEAMS
1520  p_hot_teams = &master_th->th.th_hot_teams;
1521  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1522  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1523  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1524  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1525  // it is either the actual hot team or not needed (when active_level > 0)
1526  (*p_hot_teams)[0].hot_team_nth = 1;
1527  }
1528 #endif
1529 
1530 #if OMPT_SUPPORT
1531  if (ompt_enabled.enabled) {
1532  if (ompt_enabled.ompt_callback_parallel_begin) {
1533  int team_size = master_set_numthreads
1534  ? master_set_numthreads
1535  : get__nproc_2(parent_team, master_tid);
1536  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1537  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1538  OMPT_INVOKER(call_context), return_address);
1539  }
1540  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1541  }
1542 #endif
1543 
1544  master_th->th.th_ident = loc;
1545 
1546 #if OMP_40_ENABLED
1547  if (master_th->th.th_teams_microtask && ap &&
1548  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1549  // AC: This is start of parallel that is nested inside teams construct.
1550  // The team is actual (hot), all workers are ready at the fork barrier.
1551  // No lock needed to initialize the team a bit, then free workers.
1552  parent_team->t.t_ident = loc;
1553  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1554  parent_team->t.t_argc = argc;
1555  argv = (void **)parent_team->t.t_argv;
1556  for (i = argc - 1; i >= 0; --i)
1557 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1558 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1559  *argv++ = va_arg(*ap, void *);
1560 #else
1561  *argv++ = va_arg(ap, void *);
1562 #endif
1563  // Increment our nested depth levels, but not increase the serialization
1564  if (parent_team == master_th->th.th_serial_team) {
1565  // AC: we are in serialized parallel
1566  __kmpc_serialized_parallel(loc, gtid);
1567  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1568  // AC: need this so that enquiry functions work
1569  // correctly; will restore at join time
1570  parent_team->t.t_serialized--;
1571 #if OMPT_SUPPORT
1572  void *dummy;
1573  void **exit_runtime_p;
1574 
1575  ompt_lw_taskteam_t lw_taskteam;
1576 
1577  if (ompt_enabled.enabled) {
1578  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1579  &ompt_parallel_data, return_address);
1580  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1581 
1582  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1583  // don't use lw_taskteam after linking. content was swapped
1584 
1585  /* OMPT implicit task begin */
1586  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1587  if (ompt_enabled.ompt_callback_implicit_task) {
1588  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1589  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1590  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1591  OMPT_CUR_TASK_INFO(master_th)
1592  ->thread_num = __kmp_tid_from_gtid(gtid);
1593  }
1594 
1595  /* OMPT state */
1596  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1597  } else {
1598  exit_runtime_p = &dummy;
1599  }
1600 #endif
1601 
1602  {
1603  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1604  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1605  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1606 #if OMPT_SUPPORT
1607  ,
1608  exit_runtime_p
1609 #endif
1610  );
1611  }
1612 
1613 #if OMPT_SUPPORT
1614  *exit_runtime_p = NULL;
1615  if (ompt_enabled.enabled) {
1616  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1617  if (ompt_enabled.ompt_callback_implicit_task) {
1618  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1619  ompt_scope_end, NULL, implicit_task_data, 1,
1620  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1621  }
1622  __ompt_lw_taskteam_unlink(master_th);
1623 
1624  if (ompt_enabled.ompt_callback_parallel_end) {
1625  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1626  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1627  OMPT_INVOKER(call_context), return_address);
1628  }
1629  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1630  }
1631 #endif
1632  return TRUE;
1633  }
1634 
1635  parent_team->t.t_pkfn = microtask;
1636  parent_team->t.t_invoke = invoker;
1637  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1638  parent_team->t.t_active_level++;
1639  parent_team->t.t_level++;
1640 #if OMP_50_ENABLED
1641  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1642 #endif
1643 
1644  /* Change number of threads in the team if requested */
1645  if (master_set_numthreads) { // The parallel has num_threads clause
1646  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1647  // AC: can only reduce the number of threads dynamically; cannot increase it
1648  kmp_info_t **other_threads = parent_team->t.t_threads;
1649  parent_team->t.t_nproc = master_set_numthreads;
1650  for (i = 0; i < master_set_numthreads; ++i) {
1651  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1652  }
1653  // Keep extra threads hot in the team for possible next parallels
1654  }
1655  master_th->th.th_set_nproc = 0;
1656  }
1657 
1658 #if USE_DEBUGGER
1659  if (__kmp_debugging) { // Let debugger override number of threads.
1660  int nth = __kmp_omp_num_threads(loc);
1661  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1662  master_set_numthreads = nth;
1663  }
1664  }
1665 #endif
1666 
1667  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1668  "master_th=%p, gtid=%d\n",
1669  root, parent_team, master_th, gtid));
1670  __kmp_internal_fork(loc, gtid, parent_team);
1671  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1672  "master_th=%p, gtid=%d\n",
1673  root, parent_team, master_th, gtid));
1674 
1675  /* Invoke microtask for MASTER thread */
1676  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1677  parent_team->t.t_id, parent_team->t.t_pkfn));
1678 
1679  if (!parent_team->t.t_invoke(gtid)) {
1680  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1681  }
1682  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1683  parent_team->t.t_id, parent_team->t.t_pkfn));
1684  KMP_MB(); /* Flush all pending memory write invalidates. */
1685 
1686  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1687 
1688  return TRUE;
1689  } // Parallel closely nested in teams construct
1690 #endif /* OMP_40_ENABLED */
1691 
1692 #if KMP_DEBUG
1693  if (__kmp_tasking_mode != tskm_immediate_exec) {
1694  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1695  parent_team->t.t_task_team[master_th->th.th_task_state]);
1696  }
1697 #endif
1698 
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703 #if OMP_40_ENABLED
1704  int enter_teams = ((ap == NULL && active_level == 0) ||
1705  (ap && teams_level > 0 && teams_level == level));
1706 #endif
1707  nthreads =
1708  master_set_numthreads
1709  ? master_set_numthreads
1710  : get__nproc_2(
1711  parent_team,
1712  master_tid); // TODO: get nproc directly from current task
1713 
1714  // Check whether we need to take the forkjoin lock (no need for a serialized
1715  // parallel outside of a teams construct). This code was moved here from
1716  // __kmp_reserve_threads() to speed up nested serialized parallels.
1717  if (nthreads > 1) {
1718  if ((get__max_active_levels(master_th) == 1 && (root->r.r_in_parallel
1719 #if OMP_40_ENABLED
1720  && !enter_teams
1721 #endif /* OMP_40_ENABLED */
1722  )) ||
1723  (__kmp_library == library_serial)) {
1724  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1725  " threads\n",
1726  gtid, nthreads));
1727  nthreads = 1;
1728  }
1729  }
1730  if (nthreads > 1) {
1731  /* determine how many new threads we can use */
1732  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1733  nthreads = __kmp_reserve_threads(
1734  root, parent_team, master_tid, nthreads
1735 #if OMP_40_ENABLED
1736  /* AC: If we execute teams from a parallel region (on the host), then
1737  the teams should be created, but each can have only 1 thread if
1738  nesting is disabled. If teams is called from a serial region, then
1739  the teams and their threads should be created regardless of the
1740  nesting setting. */
1741  ,
1742  enter_teams
1743 #endif /* OMP_40_ENABLED */
1744  );
1745  if (nthreads == 1) {
1746  // Free lock for single thread execution here; for multi-thread
1747  // execution it will be freed later after team of threads created
1748  // and initialized
1749  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1750  }
1751  }
1752  }
1753  KMP_DEBUG_ASSERT(nthreads > 0);
1754 
1755  // If we temporarily changed the set number of threads then restore it now
1756  master_th->th.th_set_nproc = 0;
1757 
1758  /* create a serialized parallel region? */
1759  if (nthreads == 1) {
1760 /* josh todo: hypothetical question: what do we do for OS X*? */
1761 #if KMP_OS_LINUX && \
1762  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1763  void *args[argc];
1764 #else
1765  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1766 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1767  KMP_ARCH_AARCH64) */
1768 
1769  KA_TRACE(20,
1770  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1771 
1772  __kmpc_serialized_parallel(loc, gtid);
1773 
1774  if (call_context == fork_context_intel) {
1775  /* TODO this sucks, use the compiler itself to pass args! :) */
1776  master_th->th.th_serial_team->t.t_ident = loc;
1777 #if OMP_40_ENABLED
1778  if (!ap) {
1779  // revert change made in __kmpc_serialized_parallel()
1780  master_th->th.th_serial_team->t.t_level--;
1781 // Get args from parent team for teams construct
1782 
1783 #if OMPT_SUPPORT
1784  void *dummy;
1785  void **exit_runtime_p;
1786  ompt_task_info_t *task_info;
1787 
1788  ompt_lw_taskteam_t lw_taskteam;
1789 
1790  if (ompt_enabled.enabled) {
1791  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1792  &ompt_parallel_data, return_address);
1793 
1794  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1795  // don't use lw_taskteam after linking. content was swapped
1796 
1797  task_info = OMPT_CUR_TASK_INFO(master_th);
1798  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1799  if (ompt_enabled.ompt_callback_implicit_task) {
1800  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1803  OMPT_CUR_TASK_INFO(master_th)
1804  ->thread_num = __kmp_tid_from_gtid(gtid);
1805  }
1806 
1807  /* OMPT state */
1808  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809  } else {
1810  exit_runtime_p = &dummy;
1811  }
1812 #endif
1813 
1814  {
1815  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1818  parent_team->t.t_argv
1819 #if OMPT_SUPPORT
1820  ,
1821  exit_runtime_p
1822 #endif
1823  );
1824  }
1825 
1826 #if OMPT_SUPPORT
1827  if (ompt_enabled.enabled) {
1828  exit_runtime_p = NULL;
1829  if (ompt_enabled.ompt_callback_implicit_task) {
1830  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1831  ompt_scope_end, NULL, &(task_info->task_data), 1,
1832  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1833  }
1834 
1835  __ompt_lw_taskteam_unlink(master_th);
1836  if (ompt_enabled.ompt_callback_parallel_end) {
1837  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1839  OMPT_INVOKER(call_context), return_address);
1840  }
1841  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842  }
1843 #endif
1844  } else if (microtask == (microtask_t)__kmp_teams_master) {
1845  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1846  master_th->th.th_serial_team);
1847  team = master_th->th.th_team;
1848  // team->t.t_pkfn = microtask;
1849  team->t.t_invoke = invoker;
1850  __kmp_alloc_argv_entries(argc, team, TRUE);
1851  team->t.t_argc = argc;
1852  argv = (void **)team->t.t_argv;
1853  if (ap) {
1854  for (i = argc - 1; i >= 0; --i)
1855 // TODO: revert workaround for Intel(R) 64 tracker #96
1856 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1857  *argv++ = va_arg(*ap, void *);
1858 #else
1859  *argv++ = va_arg(ap, void *);
1860 #endif
1861  } else {
1862  for (i = 0; i < argc; ++i)
1863  // Get args from parent team for teams construct
1864  argv[i] = parent_team->t.t_argv[i];
1865  }
1866  // AC: revert change made in __kmpc_serialized_parallel()
1867  // because initial code in teams should have level=0
1868  team->t.t_level--;
1869  // AC: call special invoker for outer "parallel" of teams construct
1870  invoker(gtid);
1871  } else {
1872 #endif /* OMP_40_ENABLED */
1873  argv = args;
1874  for (i = argc - 1; i >= 0; --i)
1875 // TODO: revert workaround for Intel(R) 64 tracker #96
1876 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1877  *argv++ = va_arg(*ap, void *);
1878 #else
1879  *argv++ = va_arg(ap, void *);
1880 #endif
1881  KMP_MB();
1882 
1883 #if OMPT_SUPPORT
1884  void *dummy;
1885  void **exit_runtime_p;
1886  ompt_task_info_t *task_info;
1887 
1888  ompt_lw_taskteam_t lw_taskteam;
1889 
1890  if (ompt_enabled.enabled) {
1891  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1892  &ompt_parallel_data, return_address);
1893  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1894  // don't use lw_taskteam after linking. content was swapped
1895  task_info = OMPT_CUR_TASK_INFO(master_th);
1896  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1897 
1898  /* OMPT implicit task begin */
1899  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1900  if (ompt_enabled.ompt_callback_implicit_task) {
1901  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1902  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1903  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1904  OMPT_CUR_TASK_INFO(master_th)
1905  ->thread_num = __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_runtime_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_runtime_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_runtime_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1933  }
1934 
1935  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1936  __ompt_lw_taskteam_unlink(master_th);
1937  if (ompt_enabled.ompt_callback_parallel_end) {
1938  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1939  &ompt_parallel_data, parent_task_data,
1940  OMPT_INVOKER(call_context), return_address);
1941  }
1942  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1943  }
1944 #endif
1945 #if OMP_40_ENABLED
1946  }
1947 #endif /* OMP_40_ENABLED */
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. content was swapped
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the case when not serialized;
1973  // the serialized case is handled in __kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984 #if OMP_40_ENABLED
1985  if (!master_th->th.th_teams_microtask || level > teams_level)
1986 #endif /* OMP_40_ENABLED */
1987  {
1988  /* Increment our nested depth level */
1989  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1990  }
1991 
1992  // See if we need to make a copy of the ICVs.
1993  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1994  if ((level + 1 < __kmp_nested_nth.used) &&
1995  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1996  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1997  } else {
1998  nthreads_icv = 0; // don't update
1999  }
2000 
2001 #if OMP_40_ENABLED
2002  // Figure out the proc_bind_policy for the new team.
2003  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2004  kmp_proc_bind_t proc_bind_icv =
2005  proc_bind_default; // proc_bind_default means don't update
2006  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2007  proc_bind = proc_bind_false;
2008  } else {
2009  if (proc_bind == proc_bind_default) {
2010  // No proc_bind clause specified; use current proc-bind-var for this
2011  // parallel region
2012  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2013  }
2014  /* else: The proc_bind policy was specified explicitly on parallel clause.
2015  This overrides proc-bind-var for this parallel region, but does not
2016  change proc-bind-var. */
2017  // Figure the value of proc-bind-var for the child threads.
2018  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2019  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2020  master_th->th.th_current_task->td_icvs.proc_bind)) {
2021  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2022  }
2023  }
2024 
2025  // Reset for next parallel region
2026  master_th->th.th_set_proc_bind = proc_bind_default;
2027 #endif /* OMP_40_ENABLED */
2028 
2029  if ((nthreads_icv > 0)
2030 #if OMP_40_ENABLED
2031  || (proc_bind_icv != proc_bind_default)
2032 #endif /* OMP_40_ENABLED */
2033  ) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040 
2041 #if OMP_40_ENABLED
2042  if (proc_bind_icv != proc_bind_default) {
2043  new_icvs.proc_bind = proc_bind_icv;
2044  }
2045 #endif /* OMP_40_ENABLED */
2046 
2047  /* allocate a new parallel team */
2048  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2049  team = __kmp_allocate_team(root, nthreads, nthreads,
2050 #if OMPT_SUPPORT
2051  ompt_parallel_data,
2052 #endif
2053 #if OMP_40_ENABLED
2054  proc_bind,
2055 #endif
2056  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2057  } else {
2058  /* allocate a new parallel team */
2059  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2060  team = __kmp_allocate_team(root, nthreads, nthreads,
2061 #if OMPT_SUPPORT
2062  ompt_parallel_data,
2063 #endif
2064 #if OMP_40_ENABLED
2065  proc_bind,
2066 #endif
2067  &master_th->th.th_current_task->td_icvs,
2068  argc USE_NESTED_HOT_ARG(master_th));
2069  }
2070  KF_TRACE(
2071  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2072 
2073  /* setup the new team */
2074  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2075  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2076  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2077  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2078  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2079 #if OMPT_SUPPORT
2080  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2081  return_address);
2082 #endif
2083  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2084 // TODO: parent_team->t.t_level == INT_MAX ???
2085 #if OMP_40_ENABLED
2086  if (!master_th->th.th_teams_microtask || level > teams_level) {
2087 #endif /* OMP_40_ENABLED */
2088  int new_level = parent_team->t.t_level + 1;
2089  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2090  new_level = parent_team->t.t_active_level + 1;
2091  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2092 #if OMP_40_ENABLED
2093  } else {
2094  // AC: Do not increase parallel level at start of the teams construct
2095  int new_level = parent_team->t.t_level;
2096  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2097  new_level = parent_team->t.t_active_level;
2098  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2099  }
2100 #endif /* OMP_40_ENABLED */
2101  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2102  // set master's schedule as new run-time schedule
2103  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2104 
2105 #if OMP_40_ENABLED
2106  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2107 #endif
2108 #if OMP_50_ENABLED
2109  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2110 #endif
2111 
2112  // Update the floating point rounding in the team if required.
2113  propagateFPControl(team);
2114 
2115  if (__kmp_tasking_mode != tskm_immediate_exec) {
2116  // Set master's task team to team's task team. Unless this is hot team, it
2117  // should be NULL.
2118  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2119  parent_team->t.t_task_team[master_th->th.th_task_state]);
2120  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2121  "%p, new task_team %p / team %p\n",
2122  __kmp_gtid_from_thread(master_th),
2123  master_th->th.th_task_team, parent_team,
2124  team->t.t_task_team[master_th->th.th_task_state], team));
2125 
2126  if (active_level || master_th->th.th_task_team) {
2127  // Take a memo of master's task_state
2128  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2129  if (master_th->th.th_task_state_top >=
2130  master_th->th.th_task_state_stack_sz) { // increase size
2131  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2132  kmp_uint8 *old_stack, *new_stack;
2133  kmp_uint32 i;
2134  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2135  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2136  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2137  }
2138  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2139  ++i) { // zero-init rest of stack
2140  new_stack[i] = 0;
2141  }
2142  old_stack = master_th->th.th_task_state_memo_stack;
2143  master_th->th.th_task_state_memo_stack = new_stack;
2144  master_th->th.th_task_state_stack_sz = new_size;
2145  __kmp_free(old_stack);
2146  }
2147  // Store master's task_state on stack
2148  master_th->th
2149  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2150  master_th->th.th_task_state;
2151  master_th->th.th_task_state_top++;
2152 #if KMP_NESTED_HOT_TEAMS
2153  if (master_th->th.th_hot_teams &&
2154  active_level < __kmp_hot_teams_max_level &&
2155  team == master_th->th.th_hot_teams[active_level].hot_team) {
2156  // Restore master's nested state if nested hot team
2157  master_th->th.th_task_state =
2158  master_th->th
2159  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2160  } else {
2161 #endif
2162  master_th->th.th_task_state = 0;
2163 #if KMP_NESTED_HOT_TEAMS
2164  }
2165 #endif
2166  }
2167 #if !KMP_NESTED_HOT_TEAMS
2168  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2169  (team == root->r.r_hot_team));
2170 #endif
2171  }
2172 
2173  KA_TRACE(
2174  20,
2175  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2176  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2177  team->t.t_nproc));
2178  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2179  (team->t.t_master_tid == 0 &&
2180  (team->t.t_parent == root->r.r_root_team ||
2181  team->t.t_parent->t.t_serialized)));
2182  KMP_MB();
2183 
2184  /* now, setup the arguments */
2185  argv = (void **)team->t.t_argv;
2186 #if OMP_40_ENABLED
2187  if (ap) {
2188 #endif /* OMP_40_ENABLED */
2189  for (i = argc - 1; i >= 0; --i) {
2190 // TODO: revert workaround for Intel(R) 64 tracker #96
2191 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2192  void *new_argv = va_arg(*ap, void *);
2193 #else
2194  void *new_argv = va_arg(ap, void *);
2195 #endif
2196  KMP_CHECK_UPDATE(*argv, new_argv);
2197  argv++;
2198  }
2199 #if OMP_40_ENABLED
2200  } else {
2201  for (i = 0; i < argc; ++i) {
2202  // Get args from parent team for teams construct
2203  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2204  }
2205  }
2206 #endif /* OMP_40_ENABLED */
2207 
2208  /* now actually fork the threads */
2209  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2210  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2211  root->r.r_active = TRUE;
2212 
2213  __kmp_fork_team_threads(root, team, master_th, gtid);
2214  __kmp_setup_icv_copy(team, nthreads,
2215  &master_th->th.th_current_task->td_icvs, loc);
2216 
2217 #if OMPT_SUPPORT
2218  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2219 #endif
2220 
2221  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2222 
2223 #if USE_ITT_BUILD
2224  if (team->t.t_active_level == 1 // only report frames at level 1
2225 #if OMP_40_ENABLED
2226  && !master_th->th.th_teams_microtask // not in teams construct
2227 #endif /* OMP_40_ENABLED */
2228  ) {
2229 #if USE_ITT_NOTIFY
2230  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2231  (__kmp_forkjoin_frames_mode == 3 ||
2232  __kmp_forkjoin_frames_mode == 1)) {
2233  kmp_uint64 tmp_time = 0;
2234  if (__itt_get_timestamp_ptr)
2235  tmp_time = __itt_get_timestamp();
2236  // Internal fork - report frame begin
2237  master_th->th.th_frame_time = tmp_time;
2238  if (__kmp_forkjoin_frames_mode == 3)
2239  team->t.t_region_time = tmp_time;
2240  } else
2241 // only one notification scheme (either "submit" or "forking/joined", not both)
2242 #endif /* USE_ITT_NOTIFY */
2243  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2244  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2245  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2246  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2247  }
2248  }
2249 #endif /* USE_ITT_BUILD */
2250 
2251  /* now go on and do the work */
2252  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2253  KMP_MB();
2254  KF_TRACE(10,
2255  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2256  root, team, master_th, gtid));
2257 
2258 #if USE_ITT_BUILD
2259  if (__itt_stack_caller_create_ptr) {
2260  team->t.t_stack_id =
2261  __kmp_itt_stack_caller_create(); // create new stack stitching id
2262  // before entering fork barrier
2263  }
2264 #endif /* USE_ITT_BUILD */
2265 
2266 #if OMP_40_ENABLED
2267  // AC: skip __kmp_internal_fork for the teams construct; let only the master
2268  // threads execute
2269  if (ap)
2270 #endif /* OMP_40_ENABLED */
2271  {
2272  __kmp_internal_fork(loc, gtid, team);
2273  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2274  "master_th=%p, gtid=%d\n",
2275  root, team, master_th, gtid));
2276  }
2277 
2278  if (call_context == fork_context_gnu) {
2279  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2280  return TRUE;
2281  }
2282 
2283  /* Invoke microtask for MASTER thread */
2284  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2285  team->t.t_id, team->t.t_pkfn));
2286  } // END of timer KMP_fork_call block
2287 
2288 #if KMP_STATS_ENABLED && OMP_40_ENABLED
2289  // If beginning a teams construct, then change thread state
2290  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2291  if (!ap) {
2292  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2293  }
2294 #endif
2295 
2296  if (!team->t.t_invoke(gtid)) {
2297  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2298  }
2299 
2300 #if KMP_STATS_ENABLED && OMP_40_ENABLED
2301  // If was beginning of a teams construct, then reset thread state
2302  if (!ap) {
2303  KMP_SET_THREAD_STATE(previous_state);
2304  }
2305 #endif
2306 
2307  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2308  team->t.t_id, team->t.t_pkfn));
2309  KMP_MB(); /* Flush all pending memory write invalidates. */
2310 
2311  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2312 
2313 #if OMPT_SUPPORT
2314  if (ompt_enabled.enabled) {
2315  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2316  }
2317 #endif
2318 
2319  return TRUE;
2320 }
2321 
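/* ------------------------------------------------------------------------ */
/* A minimal user-level sketch of what drives __kmp_fork_call() above: the
   compiler outlines the body of a parallel region into a microtask and emits
   a call into the runtime (for Intel/LLVM-style lowering this arrives via
   __kmpc_fork_call(), i.e. fork_context_intel; GOMP entry points arrive as
   fork_context_gnu). Illustrative only, compiled out with #if 0; would be
   built with -fopenmp. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  int data[4] = {0, 0, 0, 0};
  /* The region below becomes an outlined microtask plus a runtime fork call;
     the master invokes the microtask directly via team->t.t_invoke(). */
#pragma omp parallel num_threads(4)
  {
    data[omp_get_thread_num()] = 1 + omp_get_thread_num();
  }
  /* The join (__kmp_join_call) has happened here, so all writes are visible. */
  for (int i = 0; i < 4; ++i)
    printf("data[%d] = %d\n", i, data[i]);
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
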
2322 #if OMPT_SUPPORT
2323 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2324  kmp_team_t *team) {
2325  // restore state outside the region
2326  thread->th.ompt_thread_info.state =
2327  ((team->t.t_serialized) ? ompt_state_work_serial
2328  : ompt_state_work_parallel);
2329 }
2330 
2331 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2332  kmp_team_t *team, ompt_data_t *parallel_data,
2333  fork_context_e fork_context, void *codeptr) {
2334  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2335  if (ompt_enabled.ompt_callback_parallel_end) {
2336  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2337  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2338  codeptr);
2339  }
2340 
2341  task_info->frame.enter_frame = ompt_data_none;
2342  __kmp_join_restore_state(thread, team);
2343 }
2344 #endif
2345 
2346 void __kmp_join_call(ident_t *loc, int gtid
2347 #if OMPT_SUPPORT
2348  ,
2349  enum fork_context_e fork_context
2350 #endif
2351 #if OMP_40_ENABLED
2352  ,
2353  int exit_teams
2354 #endif /* OMP_40_ENABLED */
2355  ) {
2356  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2357  kmp_team_t *team;
2358  kmp_team_t *parent_team;
2359  kmp_info_t *master_th;
2360  kmp_root_t *root;
2361  int master_active;
2362 
2363  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2364 
2365  /* setup current data */
2366  master_th = __kmp_threads[gtid];
2367  root = master_th->th.th_root;
2368  team = master_th->th.th_team;
2369  parent_team = team->t.t_parent;
2370 
2371  master_th->th.th_ident = loc;
2372 
2373 #if OMPT_SUPPORT
2374  if (ompt_enabled.enabled) {
2375  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2376  }
2377 #endif
2378 
2379 #if KMP_DEBUG
2380  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2381  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2382  "th_task_team = %p\n",
2383  __kmp_gtid_from_thread(master_th), team,
2384  team->t.t_task_team[master_th->th.th_task_state],
2385  master_th->th.th_task_team));
2386  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2387  team->t.t_task_team[master_th->th.th_task_state]);
2388  }
2389 #endif
2390 
2391  if (team->t.t_serialized) {
2392 #if OMP_40_ENABLED
2393  if (master_th->th.th_teams_microtask) {
2394  // We are in teams construct
2395  int level = team->t.t_level;
2396  int tlevel = master_th->th.th_teams_level;
2397  if (level == tlevel) {
2398  // AC: we haven't incremented it earlier at start of teams construct,
2399  // so do it here - at the end of teams construct
2400  team->t.t_level++;
2401  } else if (level == tlevel + 1) {
2402  // AC: we are exiting parallel inside teams, need to increment
2403  // serialization in order to restore it in the next call to
2404  // __kmpc_end_serialized_parallel
2405  team->t.t_serialized++;
2406  }
2407  }
2408 #endif /* OMP_40_ENABLED */
2409  __kmpc_end_serialized_parallel(loc, gtid);
2410 
2411 #if OMPT_SUPPORT
2412  if (ompt_enabled.enabled) {
2413  __kmp_join_restore_state(master_th, parent_team);
2414  }
2415 #endif
2416 
2417  return;
2418  }
2419 
2420  master_active = team->t.t_master_active;
2421 
2422 #if OMP_40_ENABLED
2423  if (!exit_teams)
2424 #endif /* OMP_40_ENABLED */
2425  {
2426  // AC: No barrier for internal teams at exit from teams construct.
2427  // But there is barrier for external team (league).
2428  __kmp_internal_join(loc, gtid, team);
2429  }
2430 #if OMP_40_ENABLED
2431  else {
2432  master_th->th.th_task_state =
2433  0; // AC: no tasking in teams (out of any parallel)
2434  }
2435 #endif /* OMP_40_ENABLED */
2436 
2437  KMP_MB();
2438 
2439 #if OMPT_SUPPORT
2440  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2441  void *codeptr = team->t.ompt_team_info.master_return_address;
2442 #endif
2443 
2444 #if USE_ITT_BUILD
2445  if (__itt_stack_caller_create_ptr) {
2446  __kmp_itt_stack_caller_destroy(
2447  (__itt_caller)team->t
2448  .t_stack_id); // destroy the stack stitching id after join barrier
2449  }
2450 
2451  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2452  if (team->t.t_active_level == 1
2453 #if OMP_40_ENABLED
2454  && !master_th->th.th_teams_microtask /* not in teams construct */
2455 #endif /* OMP_40_ENABLED */
2456  ) {
2457  master_th->th.th_ident = loc;
2458  // only one notification scheme (either "submit" or "forking/joined", not
2459  // both)
2460  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2461  __kmp_forkjoin_frames_mode == 3)
2462  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2463  master_th->th.th_frame_time, 0, loc,
2464  master_th->th.th_team_nproc, 1);
2465  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2466  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2467  __kmp_itt_region_joined(gtid);
2468  } // active_level == 1
2469 #endif /* USE_ITT_BUILD */
2470 
2471 #if OMP_40_ENABLED
2472  if (master_th->th.th_teams_microtask && !exit_teams &&
2473  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2474  team->t.t_level == master_th->th.th_teams_level + 1) {
2475  // AC: We need to leave the team structure intact at the end of a parallel
2476  // region inside the teams construct, so that the same (hot) team works at
2477  // the next parallel region; only adjust the nesting levels.
2478 
2479  /* Decrement our nested depth level */
2480  team->t.t_level--;
2481  team->t.t_active_level--;
2482  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2483 
2484  // Restore number of threads in the team if needed. This code relies on
2485  // the proper adjustment of th_teams_size.nth after the fork in
2486  // __kmp_teams_master on each teams master in the case that
2487  // __kmp_reserve_threads reduced it.
2488  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2489  int old_num = master_th->th.th_team_nproc;
2490  int new_num = master_th->th.th_teams_size.nth;
2491  kmp_info_t **other_threads = team->t.t_threads;
2492  team->t.t_nproc = new_num;
2493  for (int i = 0; i < old_num; ++i) {
2494  other_threads[i]->th.th_team_nproc = new_num;
2495  }
2496  // Adjust states of non-used threads of the team
2497  for (int i = old_num; i < new_num; ++i) {
2498  // Re-initialize thread's barrier data.
2499  KMP_DEBUG_ASSERT(other_threads[i]);
2500  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2501  for (int b = 0; b < bs_last_barrier; ++b) {
2502  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2503  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2504 #if USE_DEBUGGER
2505  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2506 #endif
2507  }
2508  if (__kmp_tasking_mode != tskm_immediate_exec) {
2509  // Synchronize thread's task state
2510  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2511  }
2512  }
2513  }
2514 
2515 #if OMPT_SUPPORT
2516  if (ompt_enabled.enabled) {
2517  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2518  codeptr);
2519  }
2520 #endif
2521 
2522  return;
2523  }
2524 #endif /* OMP_40_ENABLED */
2525 
2526  /* do cleanup and restore the parent team */
2527  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2528  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2529 
2530  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2531 
2532  /* jc: The following lock has instructions with REL and ACQ semantics,
2533  separating the parallel user code called in this parallel region
2534  from the serial user code called after this function returns. */
2535  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2536 
2537 #if OMP_40_ENABLED
2538  if (!master_th->th.th_teams_microtask ||
2539  team->t.t_level > master_th->th.th_teams_level)
2540 #endif /* OMP_40_ENABLED */
2541  {
2542  /* Decrement our nested depth level */
2543  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2544  }
2545  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2546 
2547 #if OMPT_SUPPORT
2548  if (ompt_enabled.enabled) {
2549  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2550  if (ompt_enabled.ompt_callback_implicit_task) {
2551  int ompt_team_size = team->t.t_nproc;
2552  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2553  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2554  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2555  }
2556 
2557  task_info->frame.exit_frame = ompt_data_none;
2558  task_info->task_data = ompt_data_none;
2559  }
2560 #endif
2561 
2562  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2563  master_th, team));
2564  __kmp_pop_current_task_from_thread(master_th);
2565 
2566 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2567  // Restore master thread's partition.
2568  master_th->th.th_first_place = team->t.t_first_place;
2569  master_th->th.th_last_place = team->t.t_last_place;
2570 #endif /* OMP_40_ENABLED */
2571 #if OMP_50_ENABLED
2572  master_th->th.th_def_allocator = team->t.t_def_allocator;
2573 #endif
2574 
2575  updateHWFPControl(team);
2576 
2577  if (root->r.r_active != master_active)
2578  root->r.r_active = master_active;
2579 
2580  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2581  master_th)); // this will free worker threads
2582 
2583  /* This race was fun to find. Make sure the following is in the critical
2584  region; otherwise assertions may fail occasionally since the old team may be
2585  reallocated and the hierarchy appears inconsistent. It is actually safe to
2586  run and won't cause any bugs, but will cause those assertion failures. It's
2587  only one deref & assign, so we might as well put this in the critical region. */
2588  master_th->th.th_team = parent_team;
2589  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2590  master_th->th.th_team_master = parent_team->t.t_threads[0];
2591  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2592 
2593  /* restore serialized team, if need be */
2594  if (parent_team->t.t_serialized &&
2595  parent_team != master_th->th.th_serial_team &&
2596  parent_team != root->r.r_root_team) {
2597  __kmp_free_team(root,
2598  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2599  master_th->th.th_serial_team = parent_team;
2600  }
2601 
2602  if (__kmp_tasking_mode != tskm_immediate_exec) {
2603  if (master_th->th.th_task_state_top >
2604  0) { // Restore task state from memo stack
2605  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2606  // Remember master's state if we re-use this nested hot team
2607  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2608  master_th->th.th_task_state;
2609  --master_th->th.th_task_state_top; // pop
2610  // Now restore state at this level
2611  master_th->th.th_task_state =
2612  master_th->th
2613  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2614  }
2615  // Copy the task team from the parent team to the master thread
2616  master_th->th.th_task_team =
2617  parent_team->t.t_task_team[master_th->th.th_task_state];
2618  KA_TRACE(20,
2619  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2620  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2621  parent_team));
2622  }
2623 
2624  // TODO: GEH - cannot do this assertion because root thread not set up as
2625  // executing
2626  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2627  master_th->th.th_current_task->td_flags.executing = 1;
2628 
2629  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2630 
2631 #if OMPT_SUPPORT
2632  if (ompt_enabled.enabled) {
2633  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2634  codeptr);
2635  }
2636 #endif
2637 
2638  KMP_MB();
2639  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2640 }
2641 
2642 /* Check whether we should push an internal control record onto the
2643  serial team stack. If so, do it. */
2644 void __kmp_save_internal_controls(kmp_info_t *thread) {
2645 
2646  if (thread->th.th_team != thread->th.th_serial_team) {
2647  return;
2648  }
2649  if (thread->th.th_team->t.t_serialized > 1) {
2650  int push = 0;
2651 
2652  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2653  push = 1;
2654  } else {
2655  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2656  thread->th.th_team->t.t_serialized) {
2657  push = 1;
2658  }
2659  }
2660  if (push) { /* push a record on the serial team's stack */
2661  kmp_internal_control_t *control =
2662  (kmp_internal_control_t *)__kmp_allocate(
2663  sizeof(kmp_internal_control_t));
2664 
2665  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2666 
2667  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2668 
2669  control->next = thread->th.th_team->t.t_control_stack_top;
2670  thread->th.th_team->t.t_control_stack_top = control;
2671  }
2672  }
2673 }
2674 
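/* ------------------------------------------------------------------------ */
/* A user-level sketch of the behavior the control stack above supports: an
   ICV change made inside a nested (serialized) parallel region is restored
   when that region ends, so it does not leak into the enclosing level.
   Illustrative only and compiled out with #if 0. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_num_threads(4);
#pragma omp parallel num_threads(1) /* serialized: runs on the serial team */
  {
#pragma omp parallel num_threads(1) /* serialized again: t_serialized > 1 */
    {
      omp_set_num_threads(2); /* a control record is pushed before the change */
    }
    /* Back at the outer level the saved ICVs have been restored. */
    printf("outer nthreads-var: %d\n", omp_get_max_threads()); /* expect 4 */
  }
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
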
2675 /* Changes set_nproc */
2676 void __kmp_set_num_threads(int new_nth, int gtid) {
2677  kmp_info_t *thread;
2678  kmp_root_t *root;
2679 
2680  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2681  KMP_DEBUG_ASSERT(__kmp_init_serial);
2682 
2683  if (new_nth < 1)
2684  new_nth = 1;
2685  else if (new_nth > __kmp_max_nth)
2686  new_nth = __kmp_max_nth;
2687 
2688  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2689  thread = __kmp_threads[gtid];
2690  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2691  return; // nothing to do
2692 
2693  __kmp_save_internal_controls(thread);
2694 
2695  set__nproc(thread, new_nth);
2696 
2697  // If this omp_set_num_threads() call will cause the hot team size to be
2698  // reduced (in the absence of a num_threads clause), then reduce it now,
2699  // rather than waiting for the next parallel region.
2700  root = thread->th.th_root;
2701  if (__kmp_init_parallel && (!root->r.r_active) &&
2702  (root->r.r_hot_team->t.t_nproc > new_nth)
2703 #if KMP_NESTED_HOT_TEAMS
2704  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2705 #endif
2706  ) {
2707  kmp_team_t *hot_team = root->r.r_hot_team;
2708  int f;
2709 
2710  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2711 
2712  // Release the extra threads we don't need any more.
2713  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2714  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2715  if (__kmp_tasking_mode != tskm_immediate_exec) {
2716  // When decreasing team size, threads no longer in the team should unref
2717  // task team.
2718  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2719  }
2720  __kmp_free_thread(hot_team->t.t_threads[f]);
2721  hot_team->t.t_threads[f] = NULL;
2722  }
2723  hot_team->t.t_nproc = new_nth;
2724 #if KMP_NESTED_HOT_TEAMS
2725  if (thread->th.th_hot_teams) {
2726  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2727  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2728  }
2729 #endif
2730 
2731  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732 
2733  // Update the t_nproc field in the threads that are still active.
2734  for (f = 0; f < new_nth; f++) {
2735  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2736  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2737  }
2738  // Special flag to mark that the size was changed by an omp_set_num_threads() call
2739  hot_team->t.t_size_changed = -1;
2740  }
2741 }
2742 
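/* ------------------------------------------------------------------------ */
/* A user-level sketch of the hot-team trimming path above: when the thread
   count is lowered between parallel regions (root inactive, no num_threads
   clause pending), the extra hot-team workers are released right away rather
   than at the next fork. Illustrative only and compiled out with #if 0. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  /* First region builds the hot team with (up to) 8 workers. */
#pragma omp parallel num_threads(8)
  { (void)0; }

  /* The root is inactive here; reducing the count below the hot team size
     takes the release loop in __kmp_set_num_threads() immediately. */
  omp_set_num_threads(2);

#pragma omp parallel
  {
#pragma omp single
    printf("second region uses %d threads\n", omp_get_num_threads());
  }
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
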
2743 /* Changes max_active_levels */
2744 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2745  kmp_info_t *thread;
2746 
2747  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2748  "%d = (%d)\n",
2749  gtid, max_active_levels));
2750  KMP_DEBUG_ASSERT(__kmp_init_serial);
2751 
2752  // validate max_active_levels
2753  if (max_active_levels < 0) {
2754  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2755  // We ignore this call if the user has specified a negative value.
2756  // The current setting won't be changed. The last valid setting will be
2757  // used. A warning will be issued (if warnings are allowed as controlled by
2758  // the KMP_WARNINGS env var).
2759  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2760  "max_active_levels for thread %d = (%d)\n",
2761  gtid, max_active_levels));
2762  return;
2763  }
2764  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2765  // it's OK, the max_active_levels is within the valid range: [ 0;
2766  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2767  // We allow a zero value. (implementation defined behavior)
2768  } else {
2769  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2770  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2771  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2772  // Current upper limit is MAX_INT. (implementation defined behavior)
2773  // If the input exceeds the upper limit, we correct the input to be the
2774  // upper limit. (implementation defined behavior)
2775  // Actually, the flow should never get here as long as the limit is MAX_INT.
2776  }
2777  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2778  "max_active_levels for thread %d = (%d)\n",
2779  gtid, max_active_levels));
2780 
2781  thread = __kmp_threads[gtid];
2782 
2783  __kmp_save_internal_controls(thread);
2784 
2785  set__max_active_levels(thread, max_active_levels);
2786 }
2787 
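/* ------------------------------------------------------------------------ */
/* A user-level sketch of the validation above: a negative value is ignored
   (with a warning) and the previous setting is kept; values above
   KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped to the limit. Illustrative only
   and compiled out with #if 0. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(2); /* within [0, limit]: accepted */
  printf("max-active-levels = %d\n", omp_get_max_active_levels()); /* 2 */

  omp_set_max_active_levels(-1); /* negative: ignored, warning issued */
  printf("max-active-levels = %d\n", omp_get_max_active_levels()); /* still 2 */
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
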
2788 /* Gets max_active_levels */
2789 int __kmp_get_max_active_levels(int gtid) {
2790  kmp_info_t *thread;
2791 
2792  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2793  KMP_DEBUG_ASSERT(__kmp_init_serial);
2794 
2795  thread = __kmp_threads[gtid];
2796  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2797  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2798  "curtask_maxaclevel=%d\n",
2799  gtid, thread->th.th_current_task,
2800  thread->th.th_current_task->td_icvs.max_active_levels));
2801  return thread->th.th_current_task->td_icvs.max_active_levels;
2802 }
2803 
2804 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2805 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2806  kmp_info_t *thread;
2807  // kmp_team_t *team;
2808 
2809  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2810  gtid, (int)kind, chunk));
2811  KMP_DEBUG_ASSERT(__kmp_init_serial);
2812 
2813  // Check if the kind parameter is valid, correct if needed.
2814  // Valid parameters should fit in one of two intervals - standard or extended:
2815  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2816  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2817  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2818  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2819  // TODO: Hint needs attention in case we change the default schedule.
2820  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2821  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2822  __kmp_msg_null);
2823  kind = kmp_sched_default;
2824  chunk = 0; // ignore chunk value in case of bad kind
2825  }
2826 
2827  thread = __kmp_threads[gtid];
2828 
2829  __kmp_save_internal_controls(thread);
2830 
2831  if (kind < kmp_sched_upper_std) {
2832  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2833  // differentiate static chunked vs. unchunked: the chunk should be invalid
2834  // to indicate an unchunked schedule (which is the default)
2835  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2836  } else {
2837  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2838  __kmp_sch_map[kind - kmp_sched_lower - 1];
2839  }
2840  } else {
2841  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2842  // kmp_sched_lower - 2 ];
2843  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2844  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2845  kmp_sched_lower - 2];
2846  }
2847  if (kind == kmp_sched_auto || chunk < 1) {
2848  // ignore parameter chunk for schedule auto
2849  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2850  } else {
2851  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2852  }
2853 }
2854 
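/* ------------------------------------------------------------------------ */
/* A user-level sketch of the mapping above: omp_set_schedule() reaches
   __kmp_set_schedule(), which translates the public kind via __kmp_sch_map
   and stores the chunk (or the default chunk for auto / chunk < 1);
   __kmp_get_schedule() below performs the reverse translation. Illustrative
   only and compiled out with #if 0. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_sched_t kind;
  int chunk;

  omp_set_schedule(omp_sched_dynamic, 4); /* run-sched-var := dynamic,4 */
  omp_get_schedule(&kind, &chunk);
  printf("kind=%d chunk=%d\n", (int)kind, chunk);

  omp_set_schedule(omp_sched_auto, 99); /* chunk argument ignored for auto */
  omp_get_schedule(&kind, &chunk);
  printf("kind=%d chunk=%d\n", (int)kind, chunk); /* chunk reads back as the default */
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
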
2855 /* Gets def_sched_var ICV values */
2856 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2857  kmp_info_t *thread;
2858  enum sched_type th_type;
2859 
2860  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2861  KMP_DEBUG_ASSERT(__kmp_init_serial);
2862 
2863  thread = __kmp_threads[gtid];
2864 
2865  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2866 
2867  switch (th_type) {
2868  case kmp_sch_static:
2869  case kmp_sch_static_greedy:
2870  case kmp_sch_static_balanced:
2871  *kind = kmp_sched_static;
2872  *chunk = 0; // chunk was not set, try to show this fact via zero value
2873  return;
2874  case kmp_sch_static_chunked:
2875  *kind = kmp_sched_static;
2876  break;
2877  case kmp_sch_dynamic_chunked:
2878  *kind = kmp_sched_dynamic;
2879  break;
2880  case kmp_sch_guided_chunked:
2881  case kmp_sch_guided_iterative_chunked:
2882  case kmp_sch_guided_analytical_chunked:
2883  *kind = kmp_sched_guided;
2884  break;
2885  case kmp_sch_auto:
2886  *kind = kmp_sched_auto;
2887  break;
2888  case kmp_sch_trapezoidal:
2889  *kind = kmp_sched_trapezoidal;
2890  break;
2891 #if KMP_STATIC_STEAL_ENABLED
2892  case kmp_sch_static_steal:
2893  *kind = kmp_sched_static_steal;
2894  break;
2895 #endif
2896  default:
2897  KMP_FATAL(UnknownSchedulingType, th_type);
2898  }
2899 
2900  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2901 }
2902 
2903 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2904 
2905  int ii, dd;
2906  kmp_team_t *team;
2907  kmp_info_t *thr;
2908 
2909  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2910  KMP_DEBUG_ASSERT(__kmp_init_serial);
2911 
2912  // validate level
2913  if (level == 0)
2914  return 0;
2915  if (level < 0)
2916  return -1;
2917  thr = __kmp_threads[gtid];
2918  team = thr->th.th_team;
2919  ii = team->t.t_level;
2920  if (level > ii)
2921  return -1;
2922 
2923 #if OMP_40_ENABLED
2924  if (thr->th.th_teams_microtask) {
2925  // AC: we are in teams region where multiple nested teams have same level
2926  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2927  if (level <=
2928  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2929  KMP_DEBUG_ASSERT(ii >= tlevel);
2930  // AC: As we need to step past the teams league, we artificially
2931  // increase ii
2932  if (ii == tlevel) {
2933  ii += 2; // three teams have same level
2934  } else {
2935  ii++; // two teams have same level
2936  }
2937  }
2938  }
2939 #endif
2940 
2941  if (ii == level)
2942  return __kmp_tid_from_gtid(gtid);
2943 
2944  dd = team->t.t_serialized;
2945  level++;
2946  while (ii > level) {
2947  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2948  }
2949  if ((team->t.t_serialized) && (!dd)) {
2950  team = team->t.t_parent;
2951  continue;
2952  }
2953  if (ii > level) {
2954  team = team->t.t_parent;
2955  dd = team->t.t_serialized;
2956  ii--;
2957  }
2958  }
2959 
2960  return (dd > 1) ? (0) : (team->t.t_master_tid);
2961 }
2962 
2963 int __kmp_get_team_size(int gtid, int level) {
2964 
2965  int ii, dd;
2966  kmp_team_t *team;
2967  kmp_info_t *thr;
2968 
2969  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2970  KMP_DEBUG_ASSERT(__kmp_init_serial);
2971 
2972  // validate level
2973  if (level == 0)
2974  return 1;
2975  if (level < 0)
2976  return -1;
2977  thr = __kmp_threads[gtid];
2978  team = thr->th.th_team;
2979  ii = team->t.t_level;
2980  if (level > ii)
2981  return -1;
2982 
2983 #if OMP_40_ENABLED
2984  if (thr->th.th_teams_microtask) {
2985  // AC: we are in teams region where multiple nested teams have same level
2986  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2987  if (level <=
2988  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2989  KMP_DEBUG_ASSERT(ii >= tlevel);
2990  // AC: As we need to step past the teams league, we artificially
2991  // increase ii
2992  if (ii == tlevel) {
2993  ii += 2; // three teams have same level
2994  } else {
2995  ii++; // two teams have same level
2996  }
2997  }
2998  }
2999 #endif
3000 
3001  while (ii > level) {
3002  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3003  }
3004  if (team->t.t_serialized && (!dd)) {
3005  team = team->t.t_parent;
3006  continue;
3007  }
3008  if (ii > level) {
3009  team = team->t.t_parent;
3010  ii--;
3011  }
3012  }
3013 
3014  return team->t.t_nproc;
3015 }
3016 
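/* ------------------------------------------------------------------------ */
/* A user-level sketch of the two level-walking queries above, which back
   omp_get_ancestor_thread_num() and omp_get_team_size(). Assumes nested
   parallelism is enabled (omp_set_nested() is this era's knob). Illustrative
   only and compiled out with #if 0. */
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_nested(1); /* allow the inner region to be active */
#pragma omp parallel num_threads(2)
  {
#pragma omp parallel num_threads(3)
    {
#pragma omp single
      printf("level=%d ancestor@1=%d team@1=%d team@2=%d\n",
             omp_get_level(),                /* 2 */
             omp_get_ancestor_thread_num(1), /* tid of the enclosing outer thread */
             omp_get_team_size(1),           /* 2 */
             omp_get_team_size(2));          /* 3 */
    }
  }
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
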
3017 kmp_r_sched_t __kmp_get_schedule_global() {
3018  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3019  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3020  // independently, so the updated schedule can be obtained here.
3021 
3022  kmp_r_sched_t r_sched;
3023 
3024  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3025  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3026  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3027  // different roots (even in OMP 2.5)
3028  if (__kmp_sched == kmp_sch_static) {
3029  // replace STATIC with more detailed schedule (balanced or greedy)
3030  r_sched.r_sched_type = __kmp_static;
3031  } else if (__kmp_sched == kmp_sch_guided_chunked) {
3032  // replace GUIDED with more detailed schedule (iterative or analytical)
3033  r_sched.r_sched_type = __kmp_guided;
3034  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3035  r_sched.r_sched_type = __kmp_sched;
3036  }
3037 
3038  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3039  // __kmp_chunk may be wrong here (if it was never set)
3040  r_sched.chunk = KMP_DEFAULT_CHUNK;
3041  } else {
3042  r_sched.chunk = __kmp_chunk;
3043  }
3044 
3045  return r_sched;
3046 }
3047 
3048 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3049  at least argc entries in t_argv for the requested team. */
3050 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3051 
3052  KMP_DEBUG_ASSERT(team);
3053  if (!realloc || argc > team->t.t_max_argc) {
3054 
3055  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3056  "current entries=%d\n",
3057  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3058  /* if previously allocated heap space for args, free them */
3059  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3060  __kmp_free((void *)team->t.t_argv);
3061 
3062  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3063  /* use unused space in the cache line for arguments */
3064  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3065  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3066  "argv entries\n",
3067  team->t.t_id, team->t.t_max_argc));
3068  team->t.t_argv = &team->t.t_inline_argv[0];
3069  if (__kmp_storage_map) {
3070  __kmp_print_storage_map_gtid(
3071  -1, &team->t.t_inline_argv[0],
3072  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3073  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3074  team->t.t_id);
3075  }
3076  } else {
3077  /* allocate space for arguments in the heap */
3078  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3079  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3080  : 2 * argc;
3081  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3082  "argv entries\n",
3083  team->t.t_id, team->t.t_max_argc));
3084  team->t.t_argv =
3085  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3086  if (__kmp_storage_map) {
3087  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3088  &team->t.t_argv[team->t.t_max_argc],
3089  sizeof(void *) * team->t.t_max_argc,
3090  "team_%d.t_argv", team->t.t_id);
3091  }
3092  }
3093  }
3094 }
3095 
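/* ------------------------------------------------------------------------ */
/* A standalone sketch of the capacity rule used above: small argument counts
   fit in the inline slots carved out of the team structure, larger ones get
   a heap block of at least KMP_MIN_MALLOC_ARGV_ENTRIES entries, doubling once
   argc passes half of that minimum. The constants below are hypothetical
   stand-ins, not the runtime's values. Compiled out with #if 0. */
#if 0
#include <stdio.h>

#define INLINE_ARGV_ENTRIES 10      /* stand-in for KMP_INLINE_ARGV_ENTRIES */
#define MIN_MALLOC_ARGV_ENTRIES 100 /* stand-in for KMP_MIN_MALLOC_ARGV_ENTRIES */

/* Mirrors the t_max_argc computation in __kmp_alloc_argv_entries(). */
static int argv_capacity(int argc) {
  if (argc <= INLINE_ARGV_ENTRIES)
    return INLINE_ARGV_ENTRIES;
  return (argc <= (MIN_MALLOC_ARGV_ENTRIES >> 1)) ? MIN_MALLOC_ARGV_ENTRIES
                                                  : 2 * argc;
}

int main(void) {
  const int samples[] = {1, 10, 11, 50, 51, 200};
  for (int i = 0; i < 6; ++i)
    printf("argc=%3d -> capacity=%d\n", samples[i], argv_capacity(samples[i]));
  return 0;
}
#endif
/* ------------------------------------------------------------------------ */
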
3096 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3097  int i;
3098  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3099  team->t.t_threads =
3100  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3101  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3102  sizeof(dispatch_shared_info_t) * num_disp_buff);
3103  team->t.t_dispatch =
3104  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3105  team->t.t_implicit_task_taskdata =
3106  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3107  team->t.t_max_nproc = max_nth;
3108 
3109  /* setup dispatch buffers */
3110  for (i = 0; i < num_disp_buff; ++i) {
3111  team->t.t_disp_buffer[i].buffer_index = i;
3112 #if OMP_45_ENABLED
3113  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3114 #endif
3115  }
3116 }
3117 
3118 static void __kmp_free_team_arrays(kmp_team_t *team) {
3119  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3120  int i;
3121  for (i = 0; i < team->t.t_max_nproc; ++i) {
3122  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3123  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3124  team->t.t_dispatch[i].th_disp_buffer = NULL;
3125  }
3126  }
3127 #if KMP_USE_HIER_SCHED
3128  __kmp_dispatch_free_hierarchies(team);
3129 #endif
3130  __kmp_free(team->t.t_threads);
3131  __kmp_free(team->t.t_disp_buffer);
3132  __kmp_free(team->t.t_dispatch);
3133  __kmp_free(team->t.t_implicit_task_taskdata);
3134  team->t.t_threads = NULL;
3135  team->t.t_disp_buffer = NULL;
3136  team->t.t_dispatch = NULL;
3137  team->t.t_implicit_task_taskdata = 0;
3138 }
3139 
3140 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3141  kmp_info_t **oldThreads = team->t.t_threads;
3142 
3143  __kmp_free(team->t.t_disp_buffer);
3144  __kmp_free(team->t.t_dispatch);
3145  __kmp_free(team->t.t_implicit_task_taskdata);
3146  __kmp_allocate_team_arrays(team, max_nth);
3147 
3148  KMP_MEMCPY(team->t.t_threads, oldThreads,
3149  team->t.t_nproc * sizeof(kmp_info_t *));
3150 
3151  __kmp_free(oldThreads);
3152 }
3153 
3154 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3155 
3156  kmp_r_sched_t r_sched =
3157  __kmp_get_schedule_global(); // get current state of scheduling globals
3158 
3159 #if OMP_40_ENABLED
3160  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3161 #endif /* OMP_40_ENABLED */
3162 
3163  kmp_internal_control_t g_icvs = {
3164  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3165  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3166  // adjustment of threads (per thread)
3167  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3168  // whether blocktime is explicitly set
3169  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3170 #if KMP_USE_MONITOR
3171  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3172 // intervals
3173 #endif
3174  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3175  // next parallel region (per thread)
3176  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3177  __kmp_cg_max_nth, // int thread_limit;
3178  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3179  // for max_active_levels
3180  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3181 // {sched,chunk} pair
3182 #if OMP_40_ENABLED
3183  __kmp_nested_proc_bind.bind_types[0],
3184  __kmp_default_device,
3185 #endif /* OMP_40_ENABLED */
3186  NULL // struct kmp_internal_control *next;
3187  };
3188 
3189  return g_icvs;
3190 }
3191 
3192 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3193 
3194  kmp_internal_control_t gx_icvs;
3195  gx_icvs.serial_nesting_level =
3196  0; // probably =team->t.t_serial like in save_inter_controls
3197  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3198  gx_icvs.next = NULL;
3199 
3200  return gx_icvs;
3201 }
3202 
3203 static void __kmp_initialize_root(kmp_root_t *root) {
3204  int f;
3205  kmp_team_t *root_team;
3206  kmp_team_t *hot_team;
3207  int hot_team_max_nth;
3208  kmp_r_sched_t r_sched =
3209  __kmp_get_schedule_global(); // get current state of scheduling globals
3210  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3211  KMP_DEBUG_ASSERT(root);
3212  KMP_ASSERT(!root->r.r_begin);
3213 
3214  /* setup the root state structure */
3215  __kmp_init_lock(&root->r.r_begin_lock);
3216  root->r.r_begin = FALSE;
3217  root->r.r_active = FALSE;
3218  root->r.r_in_parallel = 0;
3219  root->r.r_blocktime = __kmp_dflt_blocktime;
3220 
3221  /* setup the root team for this task */
3222  /* allocate the root team structure */
3223  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3224 
3225  root_team =
3226  __kmp_allocate_team(root,
3227  1, // new_nproc
3228  1, // max_nproc
3229 #if OMPT_SUPPORT
3230  ompt_data_none, // root parallel id
3231 #endif
3232 #if OMP_40_ENABLED
3233  __kmp_nested_proc_bind.bind_types[0],
3234 #endif
3235  &r_icvs,
3236  0 // argc
3237  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3238  );
3239 #if USE_DEBUGGER
3240  // Non-NULL value should be assigned to make the debugger display the root
3241  // team.
3242  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3243 #endif
3244 
3245  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3246 
3247  root->r.r_root_team = root_team;
3248  root_team->t.t_control_stack_top = NULL;
3249 
3250  /* initialize root team */
3251  root_team->t.t_threads[0] = NULL;
3252  root_team->t.t_nproc = 1;
3253  root_team->t.t_serialized = 1;
3254  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3255  root_team->t.t_sched.sched = r_sched.sched;
3256  KA_TRACE(
3257  20,
3258  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3259  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3260 
3261  /* setup the hot team for this task */
3262  /* allocate the hot team structure */
3263  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3264 
3265  hot_team =
3266  __kmp_allocate_team(root,
3267  1, // new_nproc
3268  __kmp_dflt_team_nth_ub * 2, // max_nproc
3269 #if OMPT_SUPPORT
3270  ompt_data_none, // root parallel id
3271 #endif
3272 #if OMP_40_ENABLED
3273  __kmp_nested_proc_bind.bind_types[0],
3274 #endif
3275  &r_icvs,
3276  0 // argc
3277  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3278  );
3279  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3280 
3281  root->r.r_hot_team = hot_team;
3282  root_team->t.t_control_stack_top = NULL;
3283 
3284  /* first-time initialization */
3285  hot_team->t.t_parent = root_team;
3286 
3287  /* initialize hot team */
3288  hot_team_max_nth = hot_team->t.t_max_nproc;
3289  for (f = 0; f < hot_team_max_nth; ++f) {
3290  hot_team->t.t_threads[f] = NULL;
3291  }
3292  hot_team->t.t_nproc = 1;
3293  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3294  hot_team->t.t_sched.sched = r_sched.sched;
3295  hot_team->t.t_size_changed = 0;
3296 }
3297 
3298 #ifdef KMP_DEBUG
3299 
3300 typedef struct kmp_team_list_item {
3301  kmp_team_p const *entry;
3302  struct kmp_team_list_item *next;
3303 } kmp_team_list_item_t;
3304 typedef kmp_team_list_item_t *kmp_team_list_t;
3305 
3306 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3307  kmp_team_list_t list, // List of teams.
3308  kmp_team_p const *team // Team to add.
3309  ) {
3310 
3311  // List must terminate with item where both entry and next are NULL.
3312  // Team is added to the list only once.
3313  // List is sorted in ascending order by team id.
3314  // Team id is *not* a key.
3315 
3316  kmp_team_list_t l;
3317 
3318  KMP_DEBUG_ASSERT(list != NULL);
3319  if (team == NULL) {
3320  return;
3321  }
3322 
3323  __kmp_print_structure_team_accum(list, team->t.t_parent);
3324  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3325 
3326  // Search list for the team.
3327  l = list;
3328  while (l->next != NULL && l->entry != team) {
3329  l = l->next;
3330  }
3331  if (l->next != NULL) {
3332  return; // Team has been added before, exit.
3333  }
3334 
3335  // Team is not found. Search list again for insertion point.
3336  l = list;
3337  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3338  l = l->next;
3339  }
3340 
3341  // Insert team.
3342  {
3343  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3344  sizeof(kmp_team_list_item_t));
3345  *item = *l;
3346  l->entry = team;
3347  l->next = item;
3348  }
3349 }
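
// Illustrative sketch, not part of the runtime: the insertion above relies on
// a sentinel-terminated singly linked list, where inserting *before* node l
// needs no back pointer -- copy l into a fresh node, then overwrite l in
// place. example_item_t and the helper are hypothetical; the block is
// compiled out and assumes malloc is available.
#if 0
typedef struct example_item {
  int key; // the terminating sentinel keeps its original (unused) key
  struct example_item *next;
} example_item_t;

static void __example_insert_before(example_item_t *l, int key) {
  example_item_t *copy = (example_item_t *)malloc(sizeof(example_item_t));
  *copy = *l;   // old contents of l (possibly the sentinel) move to the copy
  l->key = key; // l itself now holds the newly inserted element
  l->next = copy;
}
#endif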
3350 
3351 static void __kmp_print_structure_team(char const *title,
3352                                        kmp_team_p const *team) {
3354  __kmp_printf("%s", title);
3355  if (team != NULL) {
3356  __kmp_printf("%2x %p\n", team->t.t_id, team);
3357  } else {
3358  __kmp_printf(" - (nil)\n");
3359  }
3360 }
3361 
3362 static void __kmp_print_structure_thread(char const *title,
3363  kmp_info_p const *thread) {
3364  __kmp_printf("%s", title);
3365  if (thread != NULL) {
3366  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3367  } else {
3368  __kmp_printf(" - (nil)\n");
3369  }
3370 }
3371 
3372 void __kmp_print_structure(void) {
3373 
3374  kmp_team_list_t list;
3375 
3376  // Initialize list of teams.
3377  list =
3378  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3379  list->entry = NULL;
3380  list->next = NULL;
3381 
3382  __kmp_printf("\n------------------------------\nGlobal Thread "
3383  "Table\n------------------------------\n");
3384  {
3385  int gtid;
3386  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3387  __kmp_printf("%2d", gtid);
3388  if (__kmp_threads != NULL) {
3389  __kmp_printf(" %p", __kmp_threads[gtid]);
3390  }
3391  if (__kmp_root != NULL) {
3392  __kmp_printf(" %p", __kmp_root[gtid]);
3393  }
3394  __kmp_printf("\n");
3395  }
3396  }
3397 
3398  // Print out __kmp_threads array.
3399  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3400  "----------\n");
3401  if (__kmp_threads != NULL) {
3402  int gtid;
3403  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3404  kmp_info_t const *thread = __kmp_threads[gtid];
3405  if (thread != NULL) {
3406  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3407  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3408  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3409  __kmp_print_structure_team(" Serial Team: ",
3410  thread->th.th_serial_team);
3411  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3412  __kmp_print_structure_thread(" Master: ",
3413  thread->th.th_team_master);
3414  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3415  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3416 #if OMP_40_ENABLED
3417  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3418 #endif
3419  __kmp_print_structure_thread(" Next in pool: ",
3420  thread->th.th_next_pool);
3421  __kmp_printf("\n");
3422  __kmp_print_structure_team_accum(list, thread->th.th_team);
3423  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3424  }
3425  }
3426  } else {
3427  __kmp_printf("Threads array is not allocated.\n");
3428  }
3429 
3430  // Print out __kmp_root array.
3431  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3432  "--------\n");
3433  if (__kmp_root != NULL) {
3434  int gtid;
3435  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3436  kmp_root_t const *root = __kmp_root[gtid];
3437  if (root != NULL) {
3438  __kmp_printf("GTID %2d %p:\n", gtid, root);
3439  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3440  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3441  __kmp_print_structure_thread(" Uber Thread: ",
3442  root->r.r_uber_thread);
3443  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3444  __kmp_printf(" In Parallel: %2d\n",
3445  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3446  __kmp_printf("\n");
3447  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3448  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3449  }
3450  }
3451  } else {
3452  __kmp_printf("Ubers array is not allocated.\n");
3453  }
3454 
3455  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3456  "--------\n");
3457  while (list->next != NULL) {
3458  kmp_team_p const *team = list->entry;
3459  int i;
3460  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3461  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3462  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3463  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3464  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3465  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3466  for (i = 0; i < team->t.t_nproc; ++i) {
3467  __kmp_printf(" Thread %2d: ", i);
3468  __kmp_print_structure_thread("", team->t.t_threads[i]);
3469  }
3470  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3471  __kmp_printf("\n");
3472  list = list->next;
3473  }
3474 
3475  // Print out __kmp_thread_pool and __kmp_team_pool.
3476  __kmp_printf("\n------------------------------\nPools\n----------------------"
3477  "--------\n");
3478  __kmp_print_structure_thread("Thread pool: ",
3479  CCAST(kmp_info_t *, __kmp_thread_pool));
3480  __kmp_print_structure_team("Team pool: ",
3481  CCAST(kmp_team_t *, __kmp_team_pool));
3482  __kmp_printf("\n");
3483 
3484  // Free team list.
3485  while (list != NULL) {
3486  kmp_team_list_item_t *item = list;
3487  list = list->next;
3488  KMP_INTERNAL_FREE(item);
3489  }
3490 }
3491 
3492 #endif
3493 
3494 //---------------------------------------------------------------------------
3495 // Stuff for per-thread fast random number generator
3496 // Table of primes
3497 static const unsigned __kmp_primes[] = {
3498  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3499  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3500  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3501  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3502  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3503  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3504  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3505  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3506  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3507  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3508  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3509 
3510 //---------------------------------------------------------------------------
3511 // __kmp_get_random: Get a random number using a linear congruential method.
3512 unsigned short __kmp_get_random(kmp_info_t *thread) {
3513  unsigned x = thread->th.th_x;
3514  unsigned short r = x >> 16;
3515 
3516  thread->th.th_x = x * thread->th.th_a + 1;
3517 
3518  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3519  thread->th.th_info.ds.ds_tid, r));
3520 
3521  return r;
3522 }
3523 //--------------------------------------------------------
3524 // __kmp_init_random: Initialize a random number generator
3525 void __kmp_init_random(kmp_info_t *thread) {
3526  unsigned seed = thread->th.th_info.ds.ds_tid;
3527 
3528  thread->th.th_a =
3529  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3530  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3531  KA_TRACE(30,
3532  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3533 }
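
// Illustrative sketch, not part of the runtime: the generator above is a
// plain 32-bit linear congruential generator, x(n+1) = a * x(n) + 1 mod 2^32,
// with the multiplier a drawn from __kmp_primes and the high 16 bits of the
// state returned. A standalone model with hypothetical names, compiled out:
#if 0
static unsigned short __example_lcg_next(unsigned *state, unsigned a) {
  unsigned short r = (unsigned short)(*state >> 16); // high half is returned
  *state = *state * a + 1; // unsigned arithmetic wraps mod 2^32
  return r;
}
// Usage mirroring __kmp_init_random / __kmp_get_random for a given tid:
//   unsigned a = __kmp_primes[tid % (sizeof(__kmp_primes) /
//                                    sizeof(__kmp_primes[0]))];
//   unsigned s = (tid + 1) * a + 1;
//   unsigned short v = __example_lcg_next(&s, a);
#endif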
3534 
3535 #if KMP_OS_WINDOWS
3536 /* reclaim array entries for root threads that are already dead, returns number
3537  * reclaimed */
3538 static int __kmp_reclaim_dead_roots(void) {
3539  int i, r = 0;
3540 
3541  for (i = 0; i < __kmp_threads_capacity; ++i) {
3542  if (KMP_UBER_GTID(i) &&
3543  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3544  !__kmp_root[i]
3545  ->r.r_active) { // AC: reclaim only roots died in non-active state
3546  r += __kmp_unregister_root_other_thread(i);
3547  }
3548  }
3549  return r;
3550 }
3551 #endif
3552 
3553 /* This function attempts to create free entries in __kmp_threads and
3554  __kmp_root, and returns the number of free entries generated.
3555 
3556  For Windows* OS static library, the first mechanism used is to reclaim array
3557  entries for root threads that are already dead.
3558 
3559  On all platforms, expansion is attempted on the arrays __kmp_threads and
3560  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3561  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3562  threadprivate cache array has been created. Synchronization with
3563  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3564 
3565  After any dead root reclamation, if the clipping value allows array expansion
3566  to result in the generation of a total of nNeed free slots, the function does
3567  that expansion. If not, nothing is done beyond the possible initial root
3568  thread reclamation.
3569 
3570  If any argument is negative, the behavior is undefined. */
3571 static int __kmp_expand_threads(int nNeed) {
3572  int added = 0;
3573  int minimumRequiredCapacity;
3574  int newCapacity;
3575  kmp_info_t **newThreads;
3576  kmp_root_t **newRoot;
3577 
3578 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3579 // resizing __kmp_threads does not need additional protection if foreign
3580 // threads are present
3581 
3582 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3583  /* only for Windows static library */
3584  /* reclaim array entries for root threads that are already dead */
3585  added = __kmp_reclaim_dead_roots();
3586 
3587  if (nNeed) {
3588  nNeed -= added;
3589  if (nNeed < 0)
3590  nNeed = 0;
3591  }
3592 #endif
3593  if (nNeed <= 0)
3594  return added;
3595 
3596  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3597  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3598  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3599  // > __kmp_max_nth in one of two ways:
3600  //
3601  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3602  // may not be reused by another thread, so we may need to increase
3603  // __kmp_threads_capacity to __kmp_max_nth + 1.
3604  //
3605  // 2) New foreign root(s) are encountered. We always register new foreign
3606  // roots. This may cause a smaller # of threads to be allocated at
3607  // subsequent parallel regions, but the worker threads hang around (and
3608  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3609  //
3610  // Anyway, that is the reason for moving the check to see if
3611  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3612  // instead of having it performed here. -BB
3613 
3614  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3615 
3616  /* compute expansion headroom to check if we can expand */
3617  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3618  /* possible expansion too small -- give up */
3619  return added;
3620  }
3621  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3622 
3623  newCapacity = __kmp_threads_capacity;
3624  do {
3625  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3626  : __kmp_sys_max_nth;
3627  } while (newCapacity < minimumRequiredCapacity);
3628  newThreads = (kmp_info_t **)__kmp_allocate(
3629  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3630  newRoot =
3631  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3632  KMP_MEMCPY(newThreads, __kmp_threads,
3633  __kmp_threads_capacity * sizeof(kmp_info_t *));
3634  KMP_MEMCPY(newRoot, __kmp_root,
3635  __kmp_threads_capacity * sizeof(kmp_root_t *));
3636 
3637  kmp_info_t **temp_threads = __kmp_threads;
3638  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3639  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3640  __kmp_free(temp_threads);
3641  added += newCapacity - __kmp_threads_capacity;
3642  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3643 
3644  if (newCapacity > __kmp_tp_capacity) {
3645  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3646  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3647  __kmp_threadprivate_resize_cache(newCapacity);
3648  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3649  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3650  }
3651  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3652  }
3653 
3654  return added;
3655 }
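
// Illustrative sketch, not part of the runtime: the growth rule above in
// isolation -- double the capacity until the requested headroom fits,
// clipping at the system maximum. Names are hypothetical, the block is
// compiled out, and a strictly positive starting capacity is assumed.
#if 0
static int __example_new_capacity(int capacity, int need, int sys_max) {
  if (need <= 0 || sys_max - capacity < need)
    return capacity; // nothing requested, or not enough headroom to expand
  int required = capacity + need;
  int new_capacity = capacity;
  do { // double with clipping, exactly as the loop above does
    new_capacity =
        new_capacity <= (sys_max >> 1) ? (new_capacity << 1) : sys_max;
  } while (new_capacity < required);
  return new_capacity;
}
#endif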
3656 
3657 /* Register the current thread as a root thread and obtain our gtid. We must
3658  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3659  thread that calls from __kmp_do_serial_initialize() */
3660 int __kmp_register_root(int initial_thread) {
3661  kmp_info_t *root_thread;
3662  kmp_root_t *root;
3663  int gtid;
3664  int capacity;
3665  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3666  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3667  KMP_MB();
3668 
3669  /* 2007-03-02:
3670  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3671  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" check
3672  does not work as expected -- it may return false (meaning there is at least
3673  one empty slot in the __kmp_threads array), but it is possible that the
3674  only free slot is #0, which is reserved for the initial thread and so
3675  cannot be used for this one. The following code works around this bug.
3676 
3677  However, the right solution seems to be not reserving slot #0 for the
3678  initial thread, because:
3679  (1) there is no magic in slot #0,
3680  (2) we cannot detect the initial thread reliably (the first thread that
3681  does serial initialization may not be a real initial thread).
3682  */
3683  capacity = __kmp_threads_capacity;
3684  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3685  --capacity;
3686  }
3687 
3688  /* see if there are too many threads */
3689  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3690  if (__kmp_tp_cached) {
3691  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3692  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3693  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3694  } else {
3695  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3696  __kmp_msg_null);
3697  }
3698  }
3699 
3700  /* find an available thread slot */
3701  /* Don't reassign the zero slot since we need that to only be used by initial
3702  thread */
3703  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3704  gtid++)
3705  ;
3706  KA_TRACE(1,
3707  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3708  KMP_ASSERT(gtid < __kmp_threads_capacity);
3709 
3710  /* update global accounting */
3711  __kmp_all_nth++;
3712  TCW_4(__kmp_nth, __kmp_nth + 1);
3713 
3714  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3715  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3716  if (__kmp_adjust_gtid_mode) {
3717  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3718  if (TCR_4(__kmp_gtid_mode) != 2) {
3719  TCW_4(__kmp_gtid_mode, 2);
3720  }
3721  } else {
3722  if (TCR_4(__kmp_gtid_mode) != 1) {
3723  TCW_4(__kmp_gtid_mode, 1);
3724  }
3725  }
3726  }
3727 
3728 #ifdef KMP_ADJUST_BLOCKTIME
3729  /* Adjust blocktime to zero if necessary */
3730  /* Middle initialization might not have occurred yet */
3731  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3732  if (__kmp_nth > __kmp_avail_proc) {
3733  __kmp_zero_bt = TRUE;
3734  }
3735  }
3736 #endif /* KMP_ADJUST_BLOCKTIME */
3737 
3738  /* setup this new hierarchy */
3739  if (!(root = __kmp_root[gtid])) {
3740  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3741  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3742  }
3743 
3744 #if KMP_STATS_ENABLED
3745  // Initialize stats as soon as possible (right after gtid assignment).
3746  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3747  __kmp_stats_thread_ptr->startLife();
3748  KMP_SET_THREAD_STATE(SERIAL_REGION);
3749  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3750 #endif
3751  __kmp_initialize_root(root);
3752 
3753  /* setup new root thread structure */
3754  if (root->r.r_uber_thread) {
3755  root_thread = root->r.r_uber_thread;
3756  } else {
3757  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3758  if (__kmp_storage_map) {
3759  __kmp_print_thread_storage_map(root_thread, gtid);
3760  }
3761  root_thread->th.th_info.ds.ds_gtid = gtid;
3762 #if OMPT_SUPPORT
3763  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3764 #endif
3765  root_thread->th.th_root = root;
3766  if (__kmp_env_consistency_check) {
3767  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3768  }
3769 #if USE_FAST_MEMORY
3770  __kmp_initialize_fast_memory(root_thread);
3771 #endif /* USE_FAST_MEMORY */
3772 
3773 #if KMP_USE_BGET
3774  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3775  __kmp_initialize_bget(root_thread);
3776 #endif
3777  __kmp_init_random(root_thread); // Initialize random number generator
3778  }
3779 
3780  /* setup the serial team held in reserve by the root thread */
3781  if (!root_thread->th.th_serial_team) {
3782  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3783  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3784  root_thread->th.th_serial_team =
3785  __kmp_allocate_team(root, 1, 1,
3786 #if OMPT_SUPPORT
3787  ompt_data_none, // root parallel id
3788 #endif
3789 #if OMP_40_ENABLED
3790  proc_bind_default,
3791 #endif
3792  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3793  }
3794  KMP_ASSERT(root_thread->th.th_serial_team);
3795  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3796  root_thread->th.th_serial_team));
3797 
3798  /* drop root_thread into place */
3799  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3800 
3801  root->r.r_root_team->t.t_threads[0] = root_thread;
3802  root->r.r_hot_team->t.t_threads[0] = root_thread;
3803  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3804  // AC: the team created in reserve, not for execution (it is unused for now).
3805  root_thread->th.th_serial_team->t.t_serialized = 0;
3806  root->r.r_uber_thread = root_thread;
3807 
3808  /* initialize the thread, get it ready to go */
3809  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3810  TCW_4(__kmp_init_gtid, TRUE);
3811 
3812  /* prepare the master thread for get_gtid() */
3813  __kmp_gtid_set_specific(gtid);
3814 
3815 #if USE_ITT_BUILD
3816  __kmp_itt_thread_name(gtid);
3817 #endif /* USE_ITT_BUILD */
3818 
3819 #ifdef KMP_TDATA_GTID
3820  __kmp_gtid = gtid;
3821 #endif
3822  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3823  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3824 
3825  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3826  "plain=%u\n",
3827  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3828  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3829  KMP_INIT_BARRIER_STATE));
3830  { // Initialize barrier data.
3831  int b;
3832  for (b = 0; b < bs_last_barrier; ++b) {
3833  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3834 #if USE_DEBUGGER
3835  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3836 #endif
3837  }
3838  }
3839  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3840  KMP_INIT_BARRIER_STATE);
3841 
3842 #if KMP_AFFINITY_SUPPORTED
3843 #if OMP_40_ENABLED
3844  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3845  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3846  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3847  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3848 #endif
3849  if (TCR_4(__kmp_init_middle)) {
3850  __kmp_affinity_set_init_mask(gtid, TRUE);
3851  }
3852 #endif /* KMP_AFFINITY_SUPPORTED */
3853 #if OMP_50_ENABLED
3854  root_thread->th.th_def_allocator = __kmp_def_allocator;
3855  root_thread->th.th_prev_level = 0;
3856  root_thread->th.th_prev_num_threads = 1;
3857 #endif
3858 
3859  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3860  tmp->cg_root = root_thread;
3861  tmp->cg_thread_limit = __kmp_cg_max_nth;
3862  tmp->cg_nthreads = 1;
3863  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3864  " cg_nthreads init to 1\n",
3865  root_thread, tmp));
3866  tmp->up = NULL;
3867  root_thread->th.th_cg_roots = tmp;
3868 
3869  __kmp_root_counter++;
3870 
3871 #if OMPT_SUPPORT
3872  if (!initial_thread && ompt_enabled.enabled) {
3873 
3874  kmp_info_t *root_thread = ompt_get_thread();
3875 
3876  ompt_set_thread_state(root_thread, ompt_state_overhead);
3877 
3878  if (ompt_enabled.ompt_callback_thread_begin) {
3879  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3880  ompt_thread_initial, __ompt_get_thread_data_internal());
3881  }
3882  ompt_data_t *task_data;
3883  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3884  if (ompt_enabled.ompt_callback_task_create) {
3885  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3886  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3887  // initial task has nothing to return to
3888  }
3889 
3890  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3891  }
3892 #endif
3893 
3894  KMP_MB();
3895  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3896 
3897  return gtid;
3898 }
3899 
3900 #if KMP_NESTED_HOT_TEAMS
3901 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3902  const int max_level) {
3903  int i, n, nth;
3904  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3905  if (!hot_teams || !hot_teams[level].hot_team) {
3906  return 0;
3907  }
3908  KMP_DEBUG_ASSERT(level < max_level);
3909  kmp_team_t *team = hot_teams[level].hot_team;
3910  nth = hot_teams[level].hot_team_nth;
3911  n = nth - 1; // master is not freed
3912  if (level < max_level - 1) {
3913  for (i = 0; i < nth; ++i) {
3914  kmp_info_t *th = team->t.t_threads[i];
3915  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3916  if (i > 0 && th->th.th_hot_teams) {
3917  __kmp_free(th->th.th_hot_teams);
3918  th->th.th_hot_teams = NULL;
3919  }
3920  }
3921  }
3922  __kmp_free_team(root, team, NULL);
3923  return n;
3924 }
3925 #endif
3926 
3927 // Resets a root thread and clears its root and hot teams.
3928 // Returns the number of __kmp_threads entries directly and indirectly freed.
3929 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3930  kmp_team_t *root_team = root->r.r_root_team;
3931  kmp_team_t *hot_team = root->r.r_hot_team;
3932  int n = hot_team->t.t_nproc;
3933  int i;
3934 
3935  KMP_DEBUG_ASSERT(!root->r.r_active);
3936 
3937  root->r.r_root_team = NULL;
3938  root->r.r_hot_team = NULL;
3939  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3940  // before the call to __kmp_free_team().
3941  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3942 #if KMP_NESTED_HOT_TEAMS
3943  if (__kmp_hot_teams_max_level >
3944  0) { // need to free nested hot teams and their threads if any
3945  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3946  kmp_info_t *th = hot_team->t.t_threads[i];
3947  if (__kmp_hot_teams_max_level > 1) {
3948  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3949  }
3950  if (th->th.th_hot_teams) {
3951  __kmp_free(th->th.th_hot_teams);
3952  th->th.th_hot_teams = NULL;
3953  }
3954  }
3955  }
3956 #endif
3957  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3958 
3959  // Before we can reap the thread, we need to make certain that all other
3960  // threads in the teams that had this root as ancestor have stopped trying to
3961  // steal tasks.
3962  if (__kmp_tasking_mode != tskm_immediate_exec) {
3963  __kmp_wait_to_unref_task_teams();
3964  }
3965 
3966 #if KMP_OS_WINDOWS
3967  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3968  KA_TRACE(
3969  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3970  "\n",
3971  (LPVOID) & (root->r.r_uber_thread->th),
3972  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3973  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3974 #endif /* KMP_OS_WINDOWS */
3975 
3976 #if OMPT_SUPPORT
3977  if (ompt_enabled.ompt_callback_thread_end) {
3978  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3979  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3980  }
3981 #endif
3982 
3983  TCW_4(__kmp_nth,
3984  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3985  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3986  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3987  " to %d\n",
3988  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3989  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3990 
3991  __kmp_reap_thread(root->r.r_uber_thread, 1);
3992 
3993  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3994  // it instead of freeing it.
3995  root->r.r_uber_thread = NULL;
3996  /* mark root as no longer in use */
3997  root->r.r_begin = FALSE;
3998 
3999  return n;
4000 }
4001 
4002 void __kmp_unregister_root_current_thread(int gtid) {
4003  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4004  /* this lock should be ok, since unregister_root_current_thread is never
4005  called during an abort, only during a normal close. furthermore, if you
4006  have the forkjoin lock, you should never try to get the initz lock */
4007  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4008  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4009  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4010  "exiting T#%d\n",
4011  gtid));
4012  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4013  return;
4014  }
4015  kmp_root_t *root = __kmp_root[gtid];
4016 
4017  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4018  KMP_ASSERT(KMP_UBER_GTID(gtid));
4019  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4020  KMP_ASSERT(root->r.r_active == FALSE);
4021 
4022  KMP_MB();
4023 
4024 #if OMP_45_ENABLED
4025  kmp_info_t *thread = __kmp_threads[gtid];
4026  kmp_team_t *team = thread->th.th_team;
4027  kmp_task_team_t *task_team = thread->th.th_task_team;
4028 
4029  // we need to wait for the proxy tasks before finishing the thread
4030  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4031 #if OMPT_SUPPORT
4032  // the runtime is shutting down so we won't report any events
4033  thread->th.ompt_thread_info.state = ompt_state_undefined;
4034 #endif
4035  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4036  }
4037 #endif
4038 
4039  __kmp_reset_root(gtid, root);
4040 
4041  /* free up this thread slot */
4042  __kmp_gtid_set_specific(KMP_GTID_DNE);
4043 #ifdef KMP_TDATA_GTID
4044  __kmp_gtid = KMP_GTID_DNE;
4045 #endif
4046 
4047  KMP_MB();
4048  KC_TRACE(10,
4049  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4050 
4051  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4052 }
4053 
4054 #if KMP_OS_WINDOWS
4055 /* __kmp_forkjoin_lock must be already held
4056  Unregisters a root thread that is not the current thread. Returns the number
4057  of __kmp_threads entries freed as a result. */
4058 static int __kmp_unregister_root_other_thread(int gtid) {
4059  kmp_root_t *root = __kmp_root[gtid];
4060  int r;
4061 
4062  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4063  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4064  KMP_ASSERT(KMP_UBER_GTID(gtid));
4065  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4066  KMP_ASSERT(root->r.r_active == FALSE);
4067 
4068  r = __kmp_reset_root(gtid, root);
4069  KC_TRACE(10,
4070  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4071  return r;
4072 }
4073 #endif
4074 
4075 #if KMP_DEBUG
4076 void __kmp_task_info() {
4077 
4078  kmp_int32 gtid = __kmp_entry_gtid();
4079  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4080  kmp_info_t *this_thr = __kmp_threads[gtid];
4081  kmp_team_t *steam = this_thr->th.th_serial_team;
4082  kmp_team_t *team = this_thr->th.th_team;
4083 
4084  __kmp_printf(
4085  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4086  "ptask=%p\n",
4087  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4088  team->t.t_implicit_task_taskdata[tid].td_parent);
4089 }
4090 #endif // KMP_DEBUG
4091 
4092 /* TODO optimize with one big memclr, take out what isn't needed, split
4093  responsibility to workers as much as possible, and delay initialization of
4094  features as much as possible */
4095 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4096  int tid, int gtid) {
4097  /* this_thr->th.th_info.ds.ds_gtid is setup in
4098  kmp_allocate_thread/create_worker.
4099  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4100  kmp_info_t *master = team->t.t_threads[0];
4101  KMP_DEBUG_ASSERT(this_thr != NULL);
4102  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4103  KMP_DEBUG_ASSERT(team);
4104  KMP_DEBUG_ASSERT(team->t.t_threads);
4105  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4106  KMP_DEBUG_ASSERT(master);
4107  KMP_DEBUG_ASSERT(master->th.th_root);
4108 
4109  KMP_MB();
4110 
4111  TCW_SYNC_PTR(this_thr->th.th_team, team);
4112 
4113  this_thr->th.th_info.ds.ds_tid = tid;
4114  this_thr->th.th_set_nproc = 0;
4115  if (__kmp_tasking_mode != tskm_immediate_exec)
4116  // When tasking is possible, threads are not safe to reap until they are
4117  // done tasking; this will be set when tasking code is exited in wait
4118  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4119  else // no tasking --> always safe to reap
4120  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4121 #if OMP_40_ENABLED
4122  this_thr->th.th_set_proc_bind = proc_bind_default;
4123 #if KMP_AFFINITY_SUPPORTED
4124  this_thr->th.th_new_place = this_thr->th.th_current_place;
4125 #endif
4126 #endif
4127  this_thr->th.th_root = master->th.th_root;
4128 
4129  /* setup the thread's cache of the team structure */
4130  this_thr->th.th_team_nproc = team->t.t_nproc;
4131  this_thr->th.th_team_master = master;
4132  this_thr->th.th_team_serialized = team->t.t_serialized;
4133  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4134 
4135  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4136 
4137  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4138  tid, gtid, this_thr, this_thr->th.th_current_task));
4139 
4140  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4141  team, tid, TRUE);
4142 
4143  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4144  tid, gtid, this_thr, this_thr->th.th_current_task));
4145  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4146  // __kmp_initialize_team()?
4147 
4148  /* TODO no worksharing in speculative threads */
4149  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4150 
4151  this_thr->th.th_local.this_construct = 0;
4152 
4153  if (!this_thr->th.th_pri_common) {
4154  this_thr->th.th_pri_common =
4155  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4156  if (__kmp_storage_map) {
4157  __kmp_print_storage_map_gtid(
4158  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4159  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4160  }
4161  this_thr->th.th_pri_head = NULL;
4162  }
4163 
4164  if (this_thr != master && // Master's CG root is initialized elsewhere
4165  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4166  // Make new thread's CG root same as master's
4167  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4168  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4169  // Increment new thread's CG root's counter to add the new thread
4170  this_thr->th.th_cg_roots->cg_nthreads++;
4171  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4172  " node %p of thread %p to %d\n",
4173  this_thr, this_thr->th.th_cg_roots,
4174  this_thr->th.th_cg_roots->cg_root,
4175  this_thr->th.th_cg_roots->cg_nthreads));
4176  this_thr->th.th_current_task->td_icvs.thread_limit =
4177  this_thr->th.th_cg_roots->cg_thread_limit;
4178  }
4179 
4180  /* Initialize dynamic dispatch */
4181  {
4182  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4183  // Use team max_nproc since this will never change for the team.
4184  size_t disp_size =
4185  sizeof(dispatch_private_info_t) *
4186  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4187  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4188  team->t.t_max_nproc));
4189  KMP_ASSERT(dispatch);
4190  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4191  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4192 
4193  dispatch->th_disp_index = 0;
4194 #if OMP_45_ENABLED
4195  dispatch->th_doacross_buf_idx = 0;
4196 #endif
4197  if (!dispatch->th_disp_buffer) {
4198  dispatch->th_disp_buffer =
4199  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4200 
4201  if (__kmp_storage_map) {
4202  __kmp_print_storage_map_gtid(
4203  gtid, &dispatch->th_disp_buffer[0],
4204  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4205  ? 1
4206  : __kmp_dispatch_num_buffers],
4207  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4208  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4209  gtid, team->t.t_id, gtid);
4210  }
4211  } else {
4212  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4213  }
4214 
4215  dispatch->th_dispatch_pr_current = 0;
4216  dispatch->th_dispatch_sh_current = 0;
4217 
4218  dispatch->th_deo_fcn = 0; /* ORDERED */
4219  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4220  }
4221 
4222  this_thr->th.th_next_pool = NULL;
4223 
4224  if (!this_thr->th.th_task_state_memo_stack) {
4225  size_t i;
4226  this_thr->th.th_task_state_memo_stack =
4227  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4228  this_thr->th.th_task_state_top = 0;
4229  this_thr->th.th_task_state_stack_sz = 4;
4230  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4231  ++i) // zero init the stack
4232  this_thr->th.th_task_state_memo_stack[i] = 0;
4233  }
4234 
4235  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4236  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4237 
4238  KMP_MB();
4239 }
4240 
4241 /* Allocate a new thread for the requesting team. This is only called from
4242  within a forkjoin critical section. We first try to get an available thread
4243  from the thread pool; if none is available, we fork a new one, assuming we
4244  are able to create one. This should be assured, as the caller is expected to
4245  have checked for that first. */
4246 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4247  int new_tid) {
4248  kmp_team_t *serial_team;
4249  kmp_info_t *new_thr;
4250  int new_gtid;
4251 
4252  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4253  KMP_DEBUG_ASSERT(root && team);
4254 #if !KMP_NESTED_HOT_TEAMS
4255  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4256 #endif
4257  KMP_MB();
4258 
4259  /* first, try to get one from the thread pool */
4260  if (__kmp_thread_pool) {
4261  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4262  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4263  if (new_thr == __kmp_thread_pool_insert_pt) {
4264  __kmp_thread_pool_insert_pt = NULL;
4265  }
4266  TCW_4(new_thr->th.th_in_pool, FALSE);
4267  __kmp_suspend_initialize_thread(new_thr);
4268  __kmp_lock_suspend_mx(new_thr);
4269  if (new_thr->th.th_active_in_pool == TRUE) {
4270  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4271  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4272  new_thr->th.th_active_in_pool = FALSE;
4273  }
4274 #if KMP_DEBUG
4275  else {
4276  KMP_DEBUG_ASSERT(new_thr->th.th_active == FALSE);
4277  }
4278 #endif
4279  __kmp_unlock_suspend_mx(new_thr);
4280 
4281  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4282  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4283  KMP_ASSERT(!new_thr->th.th_team);
4284  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4285 
4286  /* setup the thread structure */
4287  __kmp_initialize_info(new_thr, team, new_tid,
4288  new_thr->th.th_info.ds.ds_gtid);
4289  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4290 
4291  TCW_4(__kmp_nth, __kmp_nth + 1);
4292 
4293  new_thr->th.th_task_state = 0;
4294  new_thr->th.th_task_state_top = 0;
4295  new_thr->th.th_task_state_stack_sz = 4;
4296 
4297 #ifdef KMP_ADJUST_BLOCKTIME
4298  /* Adjust blocktime back to zero if necessary */
4299  /* Middle initialization might not have occurred yet */
4300  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4301  if (__kmp_nth > __kmp_avail_proc) {
4302  __kmp_zero_bt = TRUE;
4303  }
4304  }
4305 #endif /* KMP_ADJUST_BLOCKTIME */
4306 
4307 #if KMP_DEBUG
4308  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4309  // KMP_BARRIER_PARENT_FLAG.
4310  int b;
4311  kmp_balign_t *balign = new_thr->th.th_bar;
4312  for (b = 0; b < bs_last_barrier; ++b)
4313  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4314 #endif
4315 
4316  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4317  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4318 
4319  KMP_MB();
4320  return new_thr;
4321  }
4322 
4323  /* no thread available in the pool, so we'll fork a new one */
4324  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4325  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4326 
4327 #if KMP_USE_MONITOR
4328  // If this is the first worker thread the RTL is creating, then also
4329  // launch the monitor thread. We try to do this as early as possible.
4330  if (!TCR_4(__kmp_init_monitor)) {
4331  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4332  if (!TCR_4(__kmp_init_monitor)) {
4333  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4334  TCW_4(__kmp_init_monitor, 1);
4335  __kmp_create_monitor(&__kmp_monitor);
4336  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4337 #if KMP_OS_WINDOWS
4338  // AC: wait until monitor has started. This is a fix for CQ232808.
4339  // The reason is that if the library is loaded/unloaded in a loop with
4340  // small (parallel) work in between, then there is a high probability that
4341  // the monitor thread starts after the library shutdown. At shutdown it is
4342  // too late to cope with the problem, because when the master is in
4343  // DllMain (process detach) the monitor has no chance to start (it is
4344  // blocked), and the master has no means to inform the monitor that the
4345  // library has gone, because all the memory the monitor can access is
4346  // going to be released/reset.
4347  while (TCR_4(__kmp_init_monitor) < 2) {
4348  KMP_YIELD(TRUE);
4349  }
4350  KF_TRACE(10, ("after monitor thread has started\n"));
4351 #endif
4352  }
4353  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4354  }
4355 #endif
4356 
4357  KMP_MB();
4358  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4359  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4360  }
4361 
4362  /* allocate space for it. */
4363  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4364 
4365  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4366 
4367  if (__kmp_storage_map) {
4368  __kmp_print_thread_storage_map(new_thr, new_gtid);
4369  }
4370 
4371  // add the reserve serialized team, initialized from the team's master thread
4372  {
4373  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4374  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4375  new_thr->th.th_serial_team = serial_team =
4376  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4377 #if OMPT_SUPPORT
4378  ompt_data_none, // root parallel id
4379 #endif
4380 #if OMP_40_ENABLED
4381  proc_bind_default,
4382 #endif
4383  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4384  }
4385  KMP_ASSERT(serial_team);
4386  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4387  // execution (it is unused for now).
4388  serial_team->t.t_threads[0] = new_thr;
4389  KF_TRACE(10,
4390  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4391  new_thr));
4392 
4393  /* setup the thread structures */
4394  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4395 
4396 #if USE_FAST_MEMORY
4397  __kmp_initialize_fast_memory(new_thr);
4398 #endif /* USE_FAST_MEMORY */
4399 
4400 #if KMP_USE_BGET
4401  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4402  __kmp_initialize_bget(new_thr);
4403 #endif
4404 
4405  __kmp_init_random(new_thr); // Initialize random number generator
4406 
4407  /* Initialize these only once when thread is grabbed for a team allocation */
4408  KA_TRACE(20,
4409  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4410  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4411 
4412  int b;
4413  kmp_balign_t *balign = new_thr->th.th_bar;
4414  for (b = 0; b < bs_last_barrier; ++b) {
4415  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4416  balign[b].bb.team = NULL;
4417  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4418  balign[b].bb.use_oncore_barrier = 0;
4419  }
4420 
4421  new_thr->th.th_spin_here = FALSE;
4422  new_thr->th.th_next_waiting = 0;
4423 #if KMP_OS_UNIX
4424  new_thr->th.th_blocking = false;
4425 #endif
4426 
4427 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4428  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4429  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4430  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4431  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4432 #endif
4433 #if OMP_50_ENABLED
4434  new_thr->th.th_def_allocator = __kmp_def_allocator;
4435  new_thr->th.th_prev_level = 0;
4436  new_thr->th.th_prev_num_threads = 1;
4437 #endif
4438 
4439  TCW_4(new_thr->th.th_in_pool, FALSE);
4440  new_thr->th.th_active_in_pool = FALSE;
4441  TCW_4(new_thr->th.th_active, TRUE);
4442 
4443  /* adjust the global counters */
4444  __kmp_all_nth++;
4445  __kmp_nth++;
4446 
4447  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4448  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4449  if (__kmp_adjust_gtid_mode) {
4450  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4451  if (TCR_4(__kmp_gtid_mode) != 2) {
4452  TCW_4(__kmp_gtid_mode, 2);
4453  }
4454  } else {
4455  if (TCR_4(__kmp_gtid_mode) != 1) {
4456  TCW_4(__kmp_gtid_mode, 1);
4457  }
4458  }
4459  }
4460 
4461 #ifdef KMP_ADJUST_BLOCKTIME
4462  /* Adjust blocktime back to zero if necessary */
4463  /* Middle initialization might not have occurred yet */
4464  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4465  if (__kmp_nth > __kmp_avail_proc) {
4466  __kmp_zero_bt = TRUE;
4467  }
4468  }
4469 #endif /* KMP_ADJUST_BLOCKTIME */
4470 
4471  /* actually fork it and create the new worker thread */
4472  KF_TRACE(
4473  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4474  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4475  KF_TRACE(10,
4476  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4477 
4478  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4479  new_gtid));
4480  KMP_MB();
4481  return new_thr;
4482 }
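
// Illustrative sketch, not part of the runtime: the pool-reuse fast path at
// the top of __kmp_allocate_thread reduces to popping the head of a singly
// linked free list. example_node_t and the helper are hypothetical; the block
// is compiled out.
#if 0
typedef struct example_node {
  struct example_node *next; // plays the role of th.th_next_pool
} example_node_t;

static example_node_t *__example_pool_pop(example_node_t **pool) {
  example_node_t *head = *pool;
  if (head != NULL)
    *pool = head->next; // unlink; the caller reinitializes it for the team
  return head;          // NULL means the caller must fork a fresh thread
}
#endif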
4483 
4484 /* Reinitialize team for reuse.
4485  The hot team code calls this routine at every fork barrier, so EPCC barrier
4486  tests are extremely sensitive to changes in it, especially writes to the
4487  team struct, which cause a cache invalidation in all threads.
4488  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4489 static void __kmp_reinitialize_team(kmp_team_t *team,
4490  kmp_internal_control_t *new_icvs,
4491  ident_t *loc) {
4492  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4493  team->t.t_threads[0], team));
4494  KMP_DEBUG_ASSERT(team && new_icvs);
4495  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4496  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4497 
4498  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4499  // Copy ICVs to the master thread's implicit taskdata
4500  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4501  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4502 
4503  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4504  team->t.t_threads[0], team));
4505 }
4506 
4507 /* Initialize the team data structure.
4508  This assumes the t_threads and t_max_nproc are already set.
4509  Also, we don't touch the arguments */
4510 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4511  kmp_internal_control_t *new_icvs,
4512  ident_t *loc) {
4513  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4514 
4515  /* verify */
4516  KMP_DEBUG_ASSERT(team);
4517  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4518  KMP_DEBUG_ASSERT(team->t.t_threads);
4519  KMP_MB();
4520 
4521  team->t.t_master_tid = 0; /* not needed */
4522  /* team->t.t_master_bar; not needed */
4523  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4524  team->t.t_nproc = new_nproc;
4525 
4526  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4527  team->t.t_next_pool = NULL;
4528  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4529  * up hot team */
4530 
4531  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4532  team->t.t_invoke = NULL; /* not needed */
4533 
4534  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4535  team->t.t_sched.sched = new_icvs->sched.sched;
4536 
4537 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4538  team->t.t_fp_control_saved = FALSE; /* not needed */
4539  team->t.t_x87_fpu_control_word = 0; /* not needed */
4540  team->t.t_mxcsr = 0; /* not needed */
4541 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4542 
4543  team->t.t_construct = 0;
4544 
4545  team->t.t_ordered.dt.t_value = 0;
4546  team->t.t_master_active = FALSE;
4547 
4548 #ifdef KMP_DEBUG
4549  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4550 #endif
4551 #if KMP_OS_WINDOWS
4552  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4553 #endif
4554 
4555  team->t.t_control_stack_top = NULL;
4556 
4557  __kmp_reinitialize_team(team, new_icvs, loc);
4558 
4559  KMP_MB();
4560  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4561 }
4562 
4563 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4564 /* Sets full mask for the thread and stores the old mask in *old_mask (if
4565  non-NULL); no changes to structures. */
4565 static void
4566 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4567  if (KMP_AFFINITY_CAPABLE()) {
4568  int status;
4569  if (old_mask != NULL) {
4570  status = __kmp_get_system_affinity(old_mask, TRUE);
4571  int error = errno;
4572  if (status != 0) {
4573  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4574  __kmp_msg_null);
4575  }
4576  }
4577  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4578  }
4579 }
4580 #endif
4581 
4582 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4583 
4584 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4585 // It calculates the worker + master thread's partition based upon the parent
4586 // thread's partition, and binds each worker to a place in its partition.
4587 // The master thread's partition should already include its current binding.
4588 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4589  // Copy the master thread's place partition to the team struct
4590  kmp_info_t *master_th = team->t.t_threads[0];
4591  KMP_DEBUG_ASSERT(master_th != NULL);
4592  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4593  int first_place = master_th->th.th_first_place;
4594  int last_place = master_th->th.th_last_place;
4595  int masters_place = master_th->th.th_current_place;
4596  team->t.t_first_place = first_place;
4597  team->t.t_last_place = last_place;
4598 
4599  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4600  "bound to place %d partition = [%d,%d]\n",
4601  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4602  team->t.t_id, masters_place, first_place, last_place));
4603 
4604  switch (proc_bind) {
4605 
4606  case proc_bind_default:
4607  // serial teams might have the proc_bind policy set to proc_bind_default. It
4608  // doesn't matter, as we don't rebind the master thread for any proc_bind policy
4609  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4610  break;
4611 
4612  case proc_bind_master: {
4613  int f;
4614  int n_th = team->t.t_nproc;
4615  for (f = 1; f < n_th; f++) {
4616  kmp_info_t *th = team->t.t_threads[f];
4617  KMP_DEBUG_ASSERT(th != NULL);
4618  th->th.th_first_place = first_place;
4619  th->th.th_last_place = last_place;
4620  th->th.th_new_place = masters_place;
4621 #if OMP_50_ENABLED
4622  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4623  team->t.t_display_affinity != 1) {
4624  team->t.t_display_affinity = 1;
4625  }
4626 #endif
4627 
4628  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4629  "partition = [%d,%d]\n",
4630  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4631  f, masters_place, first_place, last_place));
4632  }
4633  } break;
4634 
4635  case proc_bind_close: {
4636  int f;
4637  int n_th = team->t.t_nproc;
4638  int n_places;
4639  if (first_place <= last_place) {
4640  n_places = last_place - first_place + 1;
4641  } else {
4642  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4643  }
4644  if (n_th <= n_places) {
4645  int place = masters_place;
4646  for (f = 1; f < n_th; f++) {
4647  kmp_info_t *th = team->t.t_threads[f];
4648  KMP_DEBUG_ASSERT(th != NULL);
4649 
4650  if (place == last_place) {
4651  place = first_place;
4652  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4653  place = 0;
4654  } else {
4655  place++;
4656  }
4657  th->th.th_first_place = first_place;
4658  th->th.th_last_place = last_place;
4659  th->th.th_new_place = place;
4660 #if OMP_50_ENABLED
4661  if (__kmp_display_affinity && place != th->th.th_current_place &&
4662  team->t.t_display_affinity != 1) {
4663  team->t.t_display_affinity = 1;
4664  }
4665 #endif
4666 
4667  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4668  "partition = [%d,%d]\n",
4669  __kmp_gtid_from_thread(team->t.t_threads[f]),
4670  team->t.t_id, f, place, first_place, last_place));
4671  }
4672  } else {
4673  int S, rem, gap, s_count;
4674  S = n_th / n_places;
4675  s_count = 0;
4676  rem = n_th - (S * n_places);
4677  gap = rem > 0 ? n_places / rem : n_places;
4678  int place = masters_place;
4679  int gap_ct = gap;
4680  for (f = 0; f < n_th; f++) {
4681  kmp_info_t *th = team->t.t_threads[f];
4682  KMP_DEBUG_ASSERT(th != NULL);
4683 
4684  th->th.th_first_place = first_place;
4685  th->th.th_last_place = last_place;
4686  th->th.th_new_place = place;
4687 #if OMP_50_ENABLED
4688  if (__kmp_display_affinity && place != th->th.th_current_place &&
4689  team->t.t_display_affinity != 1) {
4690  team->t.t_display_affinity = 1;
4691  }
4692 #endif
4693  s_count++;
4694 
4695  if ((s_count == S) && rem && (gap_ct == gap)) {
4696  // do nothing: add an extra thread to this place on the next iteration
4697  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4698  // we added an extra thread to this place; move to next place
4699  if (place == last_place) {
4700  place = first_place;
4701  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4702  place = 0;
4703  } else {
4704  place++;
4705  }
4706  s_count = 0;
4707  gap_ct = 1;
4708  rem--;
4709  } else if (s_count == S) { // place full; don't add extra
4710  if (place == last_place) {
4711  place = first_place;
4712  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4713  place = 0;
4714  } else {
4715  place++;
4716  }
4717  gap_ct++;
4718  s_count = 0;
4719  }
4720 
4721  KA_TRACE(100,
4722  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4723  "partition = [%d,%d]\n",
4724  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4725  th->th.th_new_place, first_place, last_place));
4726  }
4727  KMP_DEBUG_ASSERT(place == masters_place);
4728  }
4729  } break;
4730 
4731  case proc_bind_spread: {
4732  int f;
4733  int n_th = team->t.t_nproc;
4734  int n_places;
4735  int thidx;
4736  if (first_place <= last_place) {
4737  n_places = last_place - first_place + 1;
4738  } else {
4739  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4740  }
4741  if (n_th <= n_places) {
4742  int place = -1;
4743 
4744  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4745  int S = n_places / n_th;
4746  int s_count, rem, gap, gap_ct;
4747 
4748  place = masters_place;
4749  rem = n_places - n_th * S;
4750  gap = rem ? n_th / rem : 1;
4751  gap_ct = gap;
4752  thidx = n_th;
4753  if (update_master_only == 1)
4754  thidx = 1;
4755  for (f = 0; f < thidx; f++) {
4756  kmp_info_t *th = team->t.t_threads[f];
4757  KMP_DEBUG_ASSERT(th != NULL);
4758 
4759  th->th.th_first_place = place;
4760  th->th.th_new_place = place;
4761 #if OMP_50_ENABLED
4762  if (__kmp_display_affinity && place != th->th.th_current_place &&
4763  team->t.t_display_affinity != 1) {
4764  team->t.t_display_affinity = 1;
4765  }
4766 #endif
4767  s_count = 1;
4768  while (s_count < S) {
4769  if (place == last_place) {
4770  place = first_place;
4771  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4772  place = 0;
4773  } else {
4774  place++;
4775  }
4776  s_count++;
4777  }
4778  if (rem && (gap_ct == gap)) {
4779  if (place == last_place) {
4780  place = first_place;
4781  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4782  place = 0;
4783  } else {
4784  place++;
4785  }
4786  rem--;
4787  gap_ct = 0;
4788  }
4789  th->th.th_last_place = place;
4790  gap_ct++;
4791 
4792  if (place == last_place) {
4793  place = first_place;
4794  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4795  place = 0;
4796  } else {
4797  place++;
4798  }
4799 
4800  KA_TRACE(100,
4801  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4802  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4803  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4804  f, th->th.th_new_place, th->th.th_first_place,
4805  th->th.th_last_place, __kmp_affinity_num_masks));
4806  }
4807  } else {
4808  /* Given a uniform space of available computation places, we can create
4809  T partitions of round(P/T) size and put each thread into the first
4810  place of its partition. */
4811  double current = static_cast<double>(masters_place);
4812  double spacing =
4813  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4814  int first, last;
4815  kmp_info_t *th;
4816 
4817  thidx = n_th + 1;
4818  if (update_master_only == 1)
4819  thidx = 1;
4820  for (f = 0; f < thidx; f++) {
4821  first = static_cast<int>(current);
4822  last = static_cast<int>(current + spacing) - 1;
4823  KMP_DEBUG_ASSERT(last >= first);
4824  if (first >= n_places) {
4825  if (masters_place) {
4826  first -= n_places;
4827  last -= n_places;
4828  if (first == (masters_place + 1)) {
4829  KMP_DEBUG_ASSERT(f == n_th);
4830  first--;
4831  }
4832  if (last == masters_place) {
4833  KMP_DEBUG_ASSERT(f == (n_th - 1));
4834  last--;
4835  }
4836  } else {
4837  KMP_DEBUG_ASSERT(f == n_th);
4838  first = 0;
4839  last = 0;
4840  }
4841  }
4842  if (last >= n_places) {
4843  last = (n_places - 1);
4844  }
4845  place = first;
4846  current += spacing;
4847  if (f < n_th) {
4848  KMP_DEBUG_ASSERT(0 <= first);
4849  KMP_DEBUG_ASSERT(n_places > first);
4850  KMP_DEBUG_ASSERT(0 <= last);
4851  KMP_DEBUG_ASSERT(n_places > last);
4852  KMP_DEBUG_ASSERT(last_place >= first_place);
4853  th = team->t.t_threads[f];
4854  KMP_DEBUG_ASSERT(th);
4855  th->th.th_first_place = first;
4856  th->th.th_new_place = place;
4857  th->th.th_last_place = last;
4858 #if OMP_50_ENABLED
4859  if (__kmp_display_affinity && place != th->th.th_current_place &&
4860  team->t.t_display_affinity != 1) {
4861  team->t.t_display_affinity = 1;
4862  }
4863 #endif
4864  KA_TRACE(100,
4865  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4866  "partition = [%d,%d], spacing = %.4f\n",
4867  __kmp_gtid_from_thread(team->t.t_threads[f]),
4868  team->t.t_id, f, th->th.th_new_place,
4869  th->th.th_first_place, th->th.th_last_place, spacing));
4870  }
4871  }
4872  }
4873  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4874  } else {
4875  int S, rem, gap, s_count;
4876  S = n_th / n_places;
4877  s_count = 0;
4878  rem = n_th - (S * n_places);
4879  gap = rem > 0 ? n_places / rem : n_places;
4880  int place = masters_place;
4881  int gap_ct = gap;
4882  thidx = n_th;
4883  if (update_master_only == 1)
4884  thidx = 1;
4885  for (f = 0; f < thidx; f++) {
4886  kmp_info_t *th = team->t.t_threads[f];
4887  KMP_DEBUG_ASSERT(th != NULL);
4888 
4889  th->th.th_first_place = place;
4890  th->th.th_last_place = place;
4891  th->th.th_new_place = place;
4892 #if OMP_50_ENABLED
4893  if (__kmp_display_affinity && place != th->th.th_current_place &&
4894  team->t.t_display_affinity != 1) {
4895  team->t.t_display_affinity = 1;
4896  }
4897 #endif
4898  s_count++;
4899 
4900  if ((s_count == S) && rem && (gap_ct == gap)) {
4901  // do nothing: add an extra thread to this place on the next iteration
4902  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4903  // we added an extra thread to this place; move on to next place
4904  if (place == last_place) {
4905  place = first_place;
4906  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4907  place = 0;
4908  } else {
4909  place++;
4910  }
4911  s_count = 0;
4912  gap_ct = 1;
4913  rem--;
4914  } else if (s_count == S) { // place is full; don't add extra thread
4915  if (place == last_place) {
4916  place = first_place;
4917  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4918  place = 0;
4919  } else {
4920  place++;
4921  }
4922  gap_ct++;
4923  s_count = 0;
4924  }
4925 
4926  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4927  "partition = [%d,%d]\n",
4928  __kmp_gtid_from_thread(team->t.t_threads[f]),
4929  team->t.t_id, f, th->th.th_new_place,
4930  th->th.th_first_place, th->th.th_last_place));
4931  }
4932  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4933  }
4934  } break;
4935 
4936  default:
4937  break;
4938  }
4939 
4940  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4941 }
4942 
4943 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
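
/* An illustrative, standalone sketch (kept out of the build) of the S/rem/gap
   round-robin used above by proc_bind_close (and by the oversubscribed branch
   of proc_bind_spread) when there are more threads than places. It assumes a
   contiguous partition [0, n_places-1] with the master on place 0; the
   function and parameter names are hypothetical. */
#if 0
static void __kmp_example_close_distribution(int n_th, int n_places,
                                             int *place_of /* size n_th */) {
  int S = n_th / n_places;       // base number of threads per place
  int rem = n_th - S * n_places; // number of places that get one extra thread
  int gap = rem > 0 ? n_places / rem : n_places; // spacing of the "fat" places
  int place = 0, gap_ct = gap, s_count = 0;
  for (int f = 0; f < n_th; f++) {
    place_of[f] = place;
    s_count++;
    if (s_count == S && rem && gap_ct == gap) {
      // keep one extra thread on this place on the next iteration
    } else if (s_count == S + 1 && rem && gap_ct == gap) {
      // the "fat" place is done; advance and consume one remainder
      place = (place == n_places - 1) ? 0 : place + 1;
      s_count = 0;
      gap_ct = 1;
      rem--;
    } else if (s_count == S) {
      // regular place is full; advance
      place = (place == n_places - 1) ? 0 : place + 1;
      gap_ct++;
      s_count = 0;
    }
  }
  // e.g. n_th=10, n_places=4 yields 3,2,3,2 threads on places 0,1,2,3.
}
#endif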
4944 
4945 /* allocate a new team data structure to use. take one off of the free pool if
4946  available */
4947 kmp_team_t *
4948 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4949 #if OMPT_SUPPORT
4950  ompt_data_t ompt_parallel_data,
4951 #endif
4952 #if OMP_40_ENABLED
4953  kmp_proc_bind_t new_proc_bind,
4954 #endif
4955  kmp_internal_control_t *new_icvs,
4956  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4957  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4958  int f;
4959  kmp_team_t *team;
4960  int use_hot_team = !root->r.r_active;
4961  int level = 0;
4962 
4963  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4964  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4965  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4966  KMP_MB();
4967 
4968 #if KMP_NESTED_HOT_TEAMS
4969  kmp_hot_team_ptr_t *hot_teams;
4970  if (master) {
4971  team = master->th.th_team;
4972  level = team->t.t_active_level;
4973  if (master->th.th_teams_microtask) { // in teams construct?
4974  if (master->th.th_teams_size.nteams > 1 &&
4975  ( // #teams > 1
4976  team->t.t_pkfn ==
4977  (microtask_t)__kmp_teams_master || // inner fork of the teams
4978  master->th.th_teams_level <
4979  team->t.t_level)) { // or nested parallel inside the teams
4980  ++level; // don't increment if #teams==1 or for the outer fork of the
4981  // teams; increment otherwise
4982  }
4983  }
4984  hot_teams = master->th.th_hot_teams;
4985  if (level < __kmp_hot_teams_max_level && hot_teams &&
4986  hot_teams[level]
4987  .hot_team) { // hot team has already been allocated for given level
4988  use_hot_team = 1;
4989  } else {
4990  use_hot_team = 0;
4991  }
4992  }
4993 #endif
4994  // Optimization to use a "hot" team
4995  if (use_hot_team && new_nproc > 1) {
4996  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4997 #if KMP_NESTED_HOT_TEAMS
4998  team = hot_teams[level].hot_team;
4999 #else
5000  team = root->r.r_hot_team;
5001 #endif
5002 #if KMP_DEBUG
5003  if (__kmp_tasking_mode != tskm_immediate_exec) {
5004  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5005  "task_team[1] = %p before reinit\n",
5006  team->t.t_task_team[0], team->t.t_task_team[1]));
5007  }
5008 #endif
5009 
5010  // Has the number of threads changed?
5011  /* Let's assume the most common case is that the number of threads is
5012  unchanged, and put that case first. */
5013  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5014  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5015  // This case can mean that omp_set_num_threads() was called and the hot
5016  // team size was already reduced, so we check the special flag
5017  if (team->t.t_size_changed == -1) {
5018  team->t.t_size_changed = 1;
5019  } else {
5020  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5021  }
5022 
5023  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5024  kmp_r_sched_t new_sched = new_icvs->sched;
5025  // set master's schedule as new run-time schedule
5026  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5027 
5028  __kmp_reinitialize_team(team, new_icvs,
5029  root->r.r_uber_thread->th.th_ident);
5030 
5031  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5032  team->t.t_threads[0], team));
5033  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5034 
5035 #if OMP_40_ENABLED
5036 #if KMP_AFFINITY_SUPPORTED
5037  if ((team->t.t_size_changed == 0) &&
5038  (team->t.t_proc_bind == new_proc_bind)) {
5039  if (new_proc_bind == proc_bind_spread) {
5040  __kmp_partition_places(
5041  team, 1); // add flag to update only master for spread
5042  }
5043  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5044  "proc_bind = %d, partition = [%d,%d]\n",
5045  team->t.t_id, new_proc_bind, team->t.t_first_place,
5046  team->t.t_last_place));
5047  } else {
5048  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5049  __kmp_partition_places(team);
5050  }
5051 #else
5052  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5053 #endif /* KMP_AFFINITY_SUPPORTED */
5054 #endif /* OMP_40_ENABLED */
5055  } else if (team->t.t_nproc > new_nproc) {
5056  KA_TRACE(20,
5057  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5058  new_nproc));
5059 
5060  team->t.t_size_changed = 1;
5061 #if KMP_NESTED_HOT_TEAMS
5062  if (__kmp_hot_teams_mode == 0) {
5063  // AC: the saved number of threads should correspond to the team's value in
5064  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5065  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5066  hot_teams[level].hot_team_nth = new_nproc;
5067 #endif // KMP_NESTED_HOT_TEAMS
5068  /* release the extra threads we don't need any more */
5069  for (f = new_nproc; f < team->t.t_nproc; f++) {
5070  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5071  if (__kmp_tasking_mode != tskm_immediate_exec) {
5072  // When decreasing team size, threads no longer in the team should
5073  // unref task team.
5074  team->t.t_threads[f]->th.th_task_team = NULL;
5075  }
5076  __kmp_free_thread(team->t.t_threads[f]);
5077  team->t.t_threads[f] = NULL;
5078  }
5079 #if KMP_NESTED_HOT_TEAMS
5080  } // (__kmp_hot_teams_mode == 0)
5081  else {
5082  // When keeping extra threads in team, switch threads to wait on own
5083  // b_go flag
5084  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5085  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5086  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5087  for (int b = 0; b < bs_last_barrier; ++b) {
5088  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5089  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5090  }
5091  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5092  }
5093  }
5094  }
5095 #endif // KMP_NESTED_HOT_TEAMS
5096  team->t.t_nproc = new_nproc;
5097  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5098  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5099  __kmp_reinitialize_team(team, new_icvs,
5100  root->r.r_uber_thread->th.th_ident);
5101 
5102  // Update remaining threads
5103  for (f = 0; f < new_nproc; ++f) {
5104  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5105  }
5106 
5107  // restore the current task state of the master thread: should be the
5108  // implicit task
5109  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5110  team->t.t_threads[0], team));
5111 
5112  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5113 
5114 #ifdef KMP_DEBUG
5115  for (f = 0; f < team->t.t_nproc; f++) {
5116  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5117  team->t.t_threads[f]->th.th_team_nproc ==
5118  team->t.t_nproc);
5119  }
5120 #endif
5121 
5122 #if OMP_40_ENABLED
5123  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5124 #if KMP_AFFINITY_SUPPORTED
5125  __kmp_partition_places(team);
5126 #endif
5127 #endif
5128  } else { // team->t.t_nproc < new_nproc
5129 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5130  kmp_affin_mask_t *old_mask;
5131  if (KMP_AFFINITY_CAPABLE()) {
5132  KMP_CPU_ALLOC(old_mask);
5133  }
5134 #endif
5135 
5136  KA_TRACE(20,
5137  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5138  new_nproc));
5139 
5140  team->t.t_size_changed = 1;
5141 
5142 #if KMP_NESTED_HOT_TEAMS
5143  int avail_threads = hot_teams[level].hot_team_nth;
5144  if (new_nproc < avail_threads)
5145  avail_threads = new_nproc;
5146  kmp_info_t **other_threads = team->t.t_threads;
5147  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5148  // Adjust barrier data of reserved threads (if any) of the team
5149  // Other data will be set in __kmp_initialize_info() below.
5150  int b;
5151  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5152  for (b = 0; b < bs_last_barrier; ++b) {
5153  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5154  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5155 #if USE_DEBUGGER
5156  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5157 #endif
5158  }
5159  }
5160  if (hot_teams[level].hot_team_nth >= new_nproc) {
5161  // we have all needed threads in reserve, no need to allocate any
5162  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5163  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5164  team->t.t_nproc = new_nproc; // just get reserved threads involved
5165  } else {
5166  // we may have some threads in reserve, but not enough
5167  team->t.t_nproc =
5168  hot_teams[level]
5169  .hot_team_nth; // get reserved threads involved if any
5170  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5171 #endif // KMP_NESTED_HOT_TEAMS
5172  if (team->t.t_max_nproc < new_nproc) {
5173  /* reallocate larger arrays */
5174  __kmp_reallocate_team_arrays(team, new_nproc);
5175  __kmp_reinitialize_team(team, new_icvs, NULL);
5176  }
5177 
5178 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5179  /* Temporarily set the full mask for the master thread before creating
5180  workers. The reason is that workers inherit their affinity from the master,
5181  so if many workers are created quickly on a single core, they don't get a
5182  chance to set their own affinity for a long time. */
5183  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5184 #endif
5185 
5186  /* allocate new threads for the hot team */
5187  for (f = team->t.t_nproc; f < new_nproc; f++) {
5188  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5189  KMP_DEBUG_ASSERT(new_worker);
5190  team->t.t_threads[f] = new_worker;
5191 
5192  KA_TRACE(20,
5193  ("__kmp_allocate_team: team %d init T#%d arrived: "
5194  "join=%llu, plain=%llu\n",
5195  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5196  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5197  team->t.t_bar[bs_plain_barrier].b_arrived));
5198 
5199  { // Initialize barrier data for new threads.
5200  int b;
5201  kmp_balign_t *balign = new_worker->th.th_bar;
5202  for (b = 0; b < bs_last_barrier; ++b) {
5203  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5204  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5205  KMP_BARRIER_PARENT_FLAG);
5206 #if USE_DEBUGGER
5207  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5208 #endif
5209  }
5210  }
5211  }
5212 
5213 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5214  if (KMP_AFFINITY_CAPABLE()) {
5215  /* Restore initial master thread's affinity mask */
5216  __kmp_set_system_affinity(old_mask, TRUE);
5217  KMP_CPU_FREE(old_mask);
5218  }
5219 #endif
5220 #if KMP_NESTED_HOT_TEAMS
5221  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5222 #endif // KMP_NESTED_HOT_TEAMS
5223  /* make sure everyone is synchronized */
5224  int old_nproc = team->t.t_nproc; // save old value and use to update only
5225  // new threads below
5226  __kmp_initialize_team(team, new_nproc, new_icvs,
5227  root->r.r_uber_thread->th.th_ident);
5228 
5229  /* reinitialize the threads */
5230  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5231  for (f = 0; f < team->t.t_nproc; ++f)
5232  __kmp_initialize_info(team->t.t_threads[f], team, f,
5233  __kmp_gtid_from_tid(f, team));
5234 
5235  if (level) { // set th_task_state for new threads in nested hot team
5236  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5237  // only need to set the th_task_state for the new threads. th_task_state
5238  // for master thread will not be accurate until after this in
5239  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5240  // correct value.
5241  for (f = old_nproc; f < team->t.t_nproc; ++f)
5242  team->t.t_threads[f]->th.th_task_state =
5243  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5244  } else { // set th_task_state for new threads in non-nested hot team
5245  int old_state =
5246  team->t.t_threads[0]->th.th_task_state; // copy master's state
5247  for (f = old_nproc; f < team->t.t_nproc; ++f)
5248  team->t.t_threads[f]->th.th_task_state = old_state;
5249  }
5250 
5251 #ifdef KMP_DEBUG
5252  for (f = 0; f < team->t.t_nproc; ++f) {
5253  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5254  team->t.t_threads[f]->th.th_team_nproc ==
5255  team->t.t_nproc);
5256  }
5257 #endif
5258 
5259 #if OMP_40_ENABLED
5260  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261 #if KMP_AFFINITY_SUPPORTED
5262  __kmp_partition_places(team);
5263 #endif
5264 #endif
5265  } // Check changes in number of threads
5266 
5267 #if OMP_40_ENABLED
5268  kmp_info_t *master = team->t.t_threads[0];
5269  if (master->th.th_teams_microtask) {
5270  for (f = 1; f < new_nproc; ++f) {
5271  // propagate teams construct specific info to workers
5272  kmp_info_t *thr = team->t.t_threads[f];
5273  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5274  thr->th.th_teams_level = master->th.th_teams_level;
5275  thr->th.th_teams_size = master->th.th_teams_size;
5276  }
5277  }
5278 #endif /* OMP_40_ENABLED */
5279 #if KMP_NESTED_HOT_TEAMS
5280  if (level) {
5281  // Sync barrier state for nested hot teams, not needed for outermost hot
5282  // team.
5283  for (f = 1; f < new_nproc; ++f) {
5284  kmp_info_t *thr = team->t.t_threads[f];
5285  int b;
5286  kmp_balign_t *balign = thr->th.th_bar;
5287  for (b = 0; b < bs_last_barrier; ++b) {
5288  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5289  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5290 #if USE_DEBUGGER
5291  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5292 #endif
5293  }
5294  }
5295  }
5296 #endif // KMP_NESTED_HOT_TEAMS
5297 
5298  /* reallocate space for arguments if necessary */
5299  __kmp_alloc_argv_entries(argc, team, TRUE);
5300  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5301  // The hot team re-uses the previous task team,
5302  // if untouched during the previous release->gather phase.
5303 
5304  KF_TRACE(10, (" hot_team = %p\n", team));
5305 
5306 #if KMP_DEBUG
5307  if (__kmp_tasking_mode != tskm_immediate_exec) {
5308  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5309  "task_team[1] = %p after reinit\n",
5310  team->t.t_task_team[0], team->t.t_task_team[1]));
5311  }
5312 #endif
5313 
5314 #if OMPT_SUPPORT
5315  __ompt_team_assign_id(team, ompt_parallel_data);
5316 #endif
5317 
5318  KMP_MB();
5319 
5320  return team;
5321  }
5322 
5323  /* next, let's try to take one from the team pool */
5324  KMP_MB();
5325  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5326  /* TODO: consider resizing undersized teams instead of reaping them, now
5327  that we have a resizing mechanism */
5328  if (team->t.t_max_nproc >= max_nproc) {
5329  /* take this team from the team pool */
5330  __kmp_team_pool = team->t.t_next_pool;
5331 
5332  /* setup the team for fresh use */
5333  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5334 
5335  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5336  "task_team[1] %p to NULL\n",
5337  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5338  team->t.t_task_team[0] = NULL;
5339  team->t.t_task_team[1] = NULL;
5340 
5341  /* reallocate space for arguments if necessary */
5342  __kmp_alloc_argv_entries(argc, team, TRUE);
5343  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5344 
5345  KA_TRACE(
5346  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5347  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5348  { // Initialize barrier data.
5349  int b;
5350  for (b = 0; b < bs_last_barrier; ++b) {
5351  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5352 #if USE_DEBUGGER
5353  team->t.t_bar[b].b_master_arrived = 0;
5354  team->t.t_bar[b].b_team_arrived = 0;
5355 #endif
5356  }
5357  }
5358 
5359 #if OMP_40_ENABLED
5360  team->t.t_proc_bind = new_proc_bind;
5361 #endif
5362 
5363  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5364  team->t.t_id));
5365 
5366 #if OMPT_SUPPORT
5367  __ompt_team_assign_id(team, ompt_parallel_data);
5368 #endif
5369 
5370  KMP_MB();
5371 
5372  return team;
5373  }
5374 
5375  /* reap team if it is too small, then loop back and check the next one */
5376  // not sure if this is wise, but it will be redone during the hot-teams
5377  // rewrite.
5378  /* TODO: Use technique to find the right size hot-team, don't reap them */
5379  team = __kmp_reap_team(team);
5380  __kmp_team_pool = team;
5381  }
5382 
5383  /* nothing available in the pool, no matter, make a new team! */
5384  KMP_MB();
5385  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5386 
5387  /* and set it up */
5388  team->t.t_max_nproc = max_nproc;
5389  /* NOTE: for some reason, allocating one big buffer and dividing it up
5390  seems to hurt performance a lot on the P4, so let's not use this */
5391  __kmp_allocate_team_arrays(team, max_nproc);
5392 
5393  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5394  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5395 
5396  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5397  "%p to NULL\n",
5398  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5399  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5400  // memory, no need to duplicate
5401  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5402  // memory, no need to duplicate
5403 
5404  if (__kmp_storage_map) {
5405  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5406  }
5407 
5408  /* allocate space for arguments */
5409  __kmp_alloc_argv_entries(argc, team, FALSE);
5410  team->t.t_argc = argc;
5411 
5412  KA_TRACE(20,
5413  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5414  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5415  { // Initialize barrier data.
5416  int b;
5417  for (b = 0; b < bs_last_barrier; ++b) {
5418  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5419 #if USE_DEBUGGER
5420  team->t.t_bar[b].b_master_arrived = 0;
5421  team->t.t_bar[b].b_team_arrived = 0;
5422 #endif
5423  }
5424  }
5425 
5426 #if OMP_40_ENABLED
5427  team->t.t_proc_bind = new_proc_bind;
5428 #endif
5429 
5430 #if OMPT_SUPPORT
5431  __ompt_team_assign_id(team, ompt_parallel_data);
5432  team->t.ompt_serialized_team_info = NULL;
5433 #endif
5434 
5435  KMP_MB();
5436 
5437  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5438  team->t.t_id));
5439 
5440  return team;
5441 }
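
/* The KMP_CHECK_UPDATE uses above are conditional stores: when a hot team is
   reused, most fields already hold the desired value, and skipping the write
   avoids dirtying (and invalidating) cache lines shared with running workers.
   A minimal sketch of that idiom, with a hypothetical macro name, kept out of
   the build: */
#if 0
#define EXAMPLE_CHECK_UPDATE(lhs, rhs)                                         \
  do {                                                                         \
    if ((lhs) != (rhs))                                                        \
      (lhs) = (rhs); /* write only when the value actually changes */          \
  } while (0)
#endif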
5442 
5443 /* TODO implement hot-teams at all levels */
5444 /* TODO implement lazy thread release on demand (disband request) */
5445 
5446 /* free the team. return it to the team pool. release all the threads
5447  * associated with it */
5448 void __kmp_free_team(kmp_root_t *root,
5449  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5450  int f;
5451  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5452  team->t.t_id));
5453 
5454  /* verify state */
5455  KMP_DEBUG_ASSERT(root);
5456  KMP_DEBUG_ASSERT(team);
5457  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5458  KMP_DEBUG_ASSERT(team->t.t_threads);
5459 
5460  int use_hot_team = team == root->r.r_hot_team;
5461 #if KMP_NESTED_HOT_TEAMS
5462  int level;
5463  kmp_hot_team_ptr_t *hot_teams;
5464  if (master) {
5465  level = team->t.t_active_level - 1;
5466  if (master->th.th_teams_microtask) { // in teams construct?
5467  if (master->th.th_teams_size.nteams > 1) {
5468  ++level; // level was not increased in teams construct for
5469  // team_of_masters
5470  }
5471  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5472  master->th.th_teams_level == team->t.t_level) {
5473  ++level; // level was not increased in teams construct for
5474  // team_of_workers before the parallel
5475  } // team->t.t_level will be increased inside parallel
5476  }
5477  hot_teams = master->th.th_hot_teams;
5478  if (level < __kmp_hot_teams_max_level) {
5479  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5480  use_hot_team = 1;
5481  }
5482  }
5483 #endif // KMP_NESTED_HOT_TEAMS
5484 
5485  /* team is done working */
5486  TCW_SYNC_PTR(team->t.t_pkfn,
5487  NULL); // Important for Debugging Support Library.
5488 #if KMP_OS_WINDOWS
5489  team->t.t_copyin_counter = 0; // init counter for possible reuse
5490 #endif
5491  // Do not reset pointer to parent team to NULL for hot teams.
5492 
5493  /* if we are non-hot team, release our threads */
5494  if (!use_hot_team) {
5495  if (__kmp_tasking_mode != tskm_immediate_exec) {
5496  // Wait for threads to reach reapable state
5497  for (f = 1; f < team->t.t_nproc; ++f) {
5498  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5499  kmp_info_t *th = team->t.t_threads[f];
5500  volatile kmp_uint32 *state = &th->th.th_reap_state;
5501  while (*state != KMP_SAFE_TO_REAP) {
5502 #if KMP_OS_WINDOWS
5503  // On Windows a thread can be killed at any time, check this
5504  DWORD ecode;
5505  if (!__kmp_is_thread_alive(th, &ecode)) {
5506  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5507  break;
5508  }
5509 #endif
5510  // first check if thread is sleeping
5511  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5512  if (fl.is_sleeping())
5513  fl.resume(__kmp_gtid_from_thread(th));
5514  KMP_CPU_PAUSE();
5515  }
5516  }
5517 
5518  // Delete task teams
5519  int tt_idx;
5520  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5521  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5522  if (task_team != NULL) {
5523  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5524  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5525  team->t.t_threads[f]->th.th_task_team = NULL;
5526  }
5527  KA_TRACE(
5528  20,
5529  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5530  __kmp_get_gtid(), task_team, team->t.t_id));
5531 #if KMP_NESTED_HOT_TEAMS
5532  __kmp_free_task_team(master, task_team);
5533 #endif
5534  team->t.t_task_team[tt_idx] = NULL;
5535  }
5536  }
5537  }
5538 
5539  // Reset pointer to parent team only for non-hot teams.
5540  team->t.t_parent = NULL;
5541  team->t.t_level = 0;
5542  team->t.t_active_level = 0;
5543 
5544  /* free the worker threads */
5545  for (f = 1; f < team->t.t_nproc; ++f) {
5546  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5547  __kmp_free_thread(team->t.t_threads[f]);
5548  team->t.t_threads[f] = NULL;
5549  }
5550 
5551  /* put the team back in the team pool */
5552  /* TODO limit size of team pool, call reap_team if pool too large */
5553  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5554  __kmp_team_pool = (volatile kmp_team_t *)team;
5555  } else { // Check if team was created for the masters in a teams construct
5556  // See if first worker is a CG root
5557  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5558  team->t.t_threads[1]->th.th_cg_roots);
5559  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5560  // Clean up the CG root nodes on workers so that this team can be re-used
5561  for (f = 1; f < team->t.t_nproc; ++f) {
5562  kmp_info_t *thr = team->t.t_threads[f];
5563  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5564  thr->th.th_cg_roots->cg_root == thr);
5565  // Pop current CG root off list
5566  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5567  thr->th.th_cg_roots = tmp->up;
5568  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5569  " up to node %p. cg_nthreads was %d\n",
5570  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5571  __kmp_free(tmp);
5572  // Restore current task's thread_limit from CG root
5573  if (thr->th.th_cg_roots)
5574  thr->th.th_current_task->td_icvs.thread_limit =
5575  thr->th.th_cg_roots->cg_thread_limit;
5576  }
5577  }
5578  }
5579 
5580  KMP_MB();
5581 }
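
/* An illustrative, standalone sketch (kept out of the build) of the CG-root
   pop performed above for the workers of a teams-construct team: the top node
   is unlinked and the enclosing node's thread_limit becomes current again.
   The struct and function names are hypothetical simplifications of
   kmp_cg_root_t and the inline cleanup loop. */
#if 0
struct example_cg_root {
  int thread_limit;           // stands in for cg_thread_limit
  struct example_cg_root *up; // enclosing contention group, if any
};

static int example_pop_cg_root(struct example_cg_root **head,
                               int current_thread_limit) {
  struct example_cg_root *top = *head;
  *head = top->up; // pop the current CG root (the runtime then frees it)
  return *head ? (*head)->thread_limit // restore the enclosing limit
               : current_thread_limit; // no enclosing node: keep the old value
}
#endif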
5582 
5583 /* reap the team. destroy it, reclaim all its resources and free its memory */
5584 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5585  kmp_team_t *next_pool = team->t.t_next_pool;
5586 
5587  KMP_DEBUG_ASSERT(team);
5588  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5589  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5590  KMP_DEBUG_ASSERT(team->t.t_threads);
5591  KMP_DEBUG_ASSERT(team->t.t_argv);
5592 
5593  /* TODO clean the threads that are a part of this? */
5594 
5595  /* free stuff */
5596  __kmp_free_team_arrays(team);
5597  if (team->t.t_argv != &team->t.t_inline_argv[0])
5598  __kmp_free((void *)team->t.t_argv);
5599  __kmp_free(team);
5600 
5601  KMP_MB();
5602  return next_pool;
5603 }
5604 
5605 // Free the thread. Don't reap it, just place it on the pool of available
5606 // threads.
5607 //
5608 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5609 // binding for the affinity mechanism to be useful.
5610 //
5611 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5612 // However, we want to avoid a potential performance problem by always
5613 // scanning through the list to find the correct point at which to insert
5614 // the thread (potential N**2 behavior). To do this we keep track of the
5615 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5616 // With single-level parallelism, threads will always be added to the tail
5617 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5618 // parallelism, all bets are off and we may need to scan through the entire
5619 // free list.
5620 //
5621 // This change also has a potentially large performance benefit, for some
5622 // applications. Previously, as threads were freed from the hot team, they
5623 // would be placed back on the free list in inverse order. If the hot team
5624 // grew back to its original size, then the freed thread would be placed
5625 // back on the hot team in reverse order. This could cause bad cache
5626 // locality problems on programs where the size of the hot team regularly
5627 // grew and shrunk.
5628 //
5629 // Now, for single-level parallelism, the OMP tid is always == gtid.
5630 void __kmp_free_thread(kmp_info_t *this_th) {
5631  int gtid;
5632  kmp_info_t **scan;
5633 
5634  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5635  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5636 
5637  KMP_DEBUG_ASSERT(this_th);
5638 
5639  // When moving a thread to the pool, switch it to wait on its own b_go flag
5640  // and clear its team pointer (NULL team).
5641  int b;
5642  kmp_balign_t *balign = this_th->th.th_bar;
5643  for (b = 0; b < bs_last_barrier; ++b) {
5644  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5645  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5646  balign[b].bb.team = NULL;
5647  balign[b].bb.leaf_kids = 0;
5648  }
5649  this_th->th.th_task_state = 0;
5650  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5651 
5652  /* put thread back on the free pool */
5653  TCW_PTR(this_th->th.th_team, NULL);
5654  TCW_PTR(this_th->th.th_root, NULL);
5655  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5656 
5657  while (this_th->th.th_cg_roots) {
5658  this_th->th.th_cg_roots->cg_nthreads--;
5659  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5660  " %p of thread %p to %d\n",
5661  this_th, this_th->th.th_cg_roots,
5662  this_th->th.th_cg_roots->cg_root,
5663  this_th->th.th_cg_roots->cg_nthreads));
5664  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5665  if (tmp->cg_root == this_th) { // Thread is a cg_root
5666  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5667  KA_TRACE(
5668  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5669  this_th->th.th_cg_roots = tmp->up;
5670  __kmp_free(tmp);
5671  } else { // Worker thread
5672  this_th->th.th_cg_roots = NULL;
5673  break;
5674  }
5675  }
5676 
5677  /* If the implicit task assigned to this thread can be used by other threads,
5678  * then multiple threads may share the data and try to free the task in
5679  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5680  * likely when the hot team is disabled, but it can occur even when the hot
5681  * team is enabled */
5682  __kmp_free_implicit_task(this_th);
5683  this_th->th.th_current_task = NULL;
5684 
5685  // If the __kmp_thread_pool_insert_pt is already past the new insert
5686  // point, then we need to re-scan the entire list.
5687  gtid = this_th->th.th_info.ds.ds_gtid;
5688  if (__kmp_thread_pool_insert_pt != NULL) {
5689  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5690  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5691  __kmp_thread_pool_insert_pt = NULL;
5692  }
5693  }
5694 
5695  // Scan down the list to find the place to insert the thread.
5696  // scan is the address of a link in the list, possibly the address of
5697  // __kmp_thread_pool itself.
5698  //
5699  // In the absence of nested parallelism, the for loop will have 0 iterations.
5700  if (__kmp_thread_pool_insert_pt != NULL) {
5701  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5702  } else {
5703  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5704  }
5705  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5706  scan = &((*scan)->th.th_next_pool))
5707  ;
5708 
5709  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5710  // to its address.
5711  TCW_PTR(this_th->th.th_next_pool, *scan);
5712  __kmp_thread_pool_insert_pt = *scan = this_th;
5713  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5714  (this_th->th.th_info.ds.ds_gtid <
5715  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5716  TCW_4(this_th->th.th_in_pool, TRUE);
5717  __kmp_suspend_initialize_thread(this_th);
5718  __kmp_lock_suspend_mx(this_th);
5719  if (this_th->th.th_active == TRUE) {
5720  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5721  this_th->th.th_active_in_pool = TRUE;
5722  }
5723 #if KMP_DEBUG
5724  else {
5725  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5726  }
5727 #endif
5728  __kmp_unlock_suspend_mx(this_th);
5729 
5730  TCW_4(__kmp_nth, __kmp_nth - 1);
5731 
5732 #ifdef KMP_ADJUST_BLOCKTIME
5733  /* Adjust blocktime back to user setting or default if necessary */
5734  /* Middle initialization might never have occurred */
5735  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5736  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5737  if (__kmp_nth <= __kmp_avail_proc) {
5738  __kmp_zero_bt = FALSE;
5739  }
5740  }
5741 #endif /* KMP_ADJUST_BLOCKTIME */
5742 
5743  KMP_MB();
5744 }
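
/* An illustrative, standalone sketch (kept out of the build) of the
   sorted-insert-with-cached-insertion-point scheme described before
   __kmp_free_thread(). The node type and names are hypothetical stand-ins for
   kmp_info_t, __kmp_thread_pool, and __kmp_thread_pool_insert_pt. */
#if 0
struct example_node {
  int gtid;
  struct example_node *next;
};
static struct example_node *example_pool = NULL;      // sorted by gtid
static struct example_node *example_insert_pt = NULL; // last insertion point

static void example_pool_insert(struct example_node *n) {
  // If the cached insertion point is already past the new gtid, rescan from
  // the head of the list.
  if (example_insert_pt != NULL && example_insert_pt->gtid > n->gtid)
    example_insert_pt = NULL;
  struct example_node **scan =
      example_insert_pt ? &example_insert_pt->next : &example_pool;
  // With single-level parallelism this loop does 0 iterations: new nodes are
  // appended at the tail.
  while (*scan != NULL && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan;
  example_insert_pt = *scan = n; // remember where we inserted
}
#endif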
5745 
5746 /* ------------------------------------------------------------------------ */
5747 
5748 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5749  int gtid = this_thr->th.th_info.ds.ds_gtid;
5750  /* void *stack_data;*/
5751  kmp_team_t *(*volatile pteam);
5752 
5753  KMP_MB();
5754  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5755 
5756  if (__kmp_env_consistency_check) {
5757  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5758  }
5759 
5760 #if OMPT_SUPPORT
5761  ompt_data_t *thread_data;
5762  if (ompt_enabled.enabled) {
5763  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5764  *thread_data = ompt_data_none;
5765 
5766  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5767  this_thr->th.ompt_thread_info.wait_id = 0;
5768  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5769  if (ompt_enabled.ompt_callback_thread_begin) {
5770  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5771  ompt_thread_worker, thread_data);
5772  }
5773  }
5774 #endif
5775 
5776 #if OMPT_SUPPORT
5777  if (ompt_enabled.enabled) {
5778  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5779  }
5780 #endif
5781  /* This is the place where threads wait for work */
5782  while (!TCR_4(__kmp_global.g.g_done)) {
5783  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5784  KMP_MB();
5785 
5786  /* wait for work to do */
5787  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5788 
5789  /* No tid yet since not part of a team */
5790  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5791 
5792 #if OMPT_SUPPORT
5793  if (ompt_enabled.enabled) {
5794  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5795  }
5796 #endif
5797 
5798  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5799 
5800  /* have we been allocated? */
5801  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5802  /* we were just woken up, so run our new task */
5803  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5804  int rc;
5805  KA_TRACE(20,
5806  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5807  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5808  (*pteam)->t.t_pkfn));
5809 
5810  updateHWFPControl(*pteam);
5811 
5812 #if OMPT_SUPPORT
5813  if (ompt_enabled.enabled) {
5814  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5815  }
5816 #endif
5817 
5818  rc = (*pteam)->t.t_invoke(gtid);
5819  KMP_ASSERT(rc);
5820 
5821  KMP_MB();
5822  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5823  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5824  (*pteam)->t.t_pkfn));
5825  }
5826 #if OMPT_SUPPORT
5827  if (ompt_enabled.enabled) {
5828  /* no frame set while outside task */
5829  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5830 
5831  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5832  }
5833 #endif
5834  /* join barrier after parallel region */
5835  __kmp_join_barrier(gtid);
5836  }
5837  }
5838  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5839 
5840 #if OMPT_SUPPORT
5841  if (ompt_enabled.ompt_callback_thread_end) {
5842  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5843  }
5844 #endif
5845 
5846  this_thr->th.th_task_team = NULL;
5847  /* run the destructors for the threadprivate data for this thread */
5848  __kmp_common_destroy_gtid(gtid);
5849 
5850  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5851  KMP_MB();
5852  return this_thr;
5853 }
5854 
5855 /* ------------------------------------------------------------------------ */
5856 
5857 void __kmp_internal_end_dest(void *specific_gtid) {
5858 #if KMP_COMPILER_ICC
5859 #pragma warning(push)
5860 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5861 // significant bits
5862 #endif
5863  // Make sure no significant bits are lost
5864  int gtid = (kmp_intptr_t)specific_gtid - 1;
5865 #if KMP_COMPILER_ICC
5866 #pragma warning(pop)
5867 #endif
5868 
5869  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5870  /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
5871  * this is because 0 is reserved for the nothing-stored case */
5872 
5873  /* josh: One reason for setting the gtid specific data even when it is being
5874  destroyed by pthread is to allow gtid lookup through thread specific data
5875  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5876  that gets executed in the call to __kmp_internal_end_thread, actually
5877  gets the gtid through the thread specific data. Setting it here seems
5878  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5879  to run smoothly.
5880  todo: get rid of this after we remove the dependence on
5881  __kmp_gtid_get_specific */
5882  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5883  __kmp_gtid_set_specific(gtid);
5884 #ifdef KMP_TDATA_GTID
5885  __kmp_gtid = gtid;
5886 #endif
5887  __kmp_internal_end_thread(gtid);
5888 }
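
/* An illustrative sketch (kept out of the build) of the gtid+1 encoding noted
   above: storing gtid+1 in thread-local storage lets a raw 0 mean "nothing
   stored" while still representing gtid 0. The helper names are hypothetical. */
#if 0
static void *example_encode_gtid(int gtid) {
  return (void *)(kmp_intptr_t)(gtid + 1); // 0 is reserved for "no gtid stored"
}
static int example_decode_gtid(void *specific) {
  return (int)(kmp_intptr_t)specific - 1; // yields -1 when nothing was stored
}
#endif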
5889 
5890 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5891 
5892 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5893 // cases destructors work perfectly, but in real libomp.so I have no evidence it
5894 // is ever called. However, the -fini linker option in makefile.mk works fine.
5895 
5896 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5897  __kmp_internal_end_atexit();
5898 }
5899 
5900 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5901 
5902 #endif
5903 
5904 /* [Windows] josh: when the atexit handler is called, there may still be more
5905  than one thread alive */
5906 void __kmp_internal_end_atexit(void) {
5907  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5908  /* [Windows]
5909  josh: ideally, we want to completely shutdown the library in this atexit
5910  handler, but stat code that depends on thread specific data for gtid fails
5911  because that data becomes unavailable at some point during the shutdown, so
5912  we call __kmp_internal_end_thread instead. We should eventually remove the
5913  dependency on __kmp_get_specific_gtid in the stat code and use
5914  __kmp_internal_end_library to cleanly shutdown the library.
5915 
5916  // TODO: Can some of this comment about GVS be removed?
5917  I suspect that the offending stat code is executed when the calling thread
5918  tries to clean up a dead root thread's data structures, resulting in GVS
5919  code trying to close the GVS structures for that thread, but since the stat
5920  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5921  the calling thread is cleaning up itself instead of another thread, it gets
5922  confused. This happens because allowing a thread to unregister and clean up
5923  another thread is a recent modification for addressing an issue.
5924  Based on the current design (20050722), a thread may end up
5925  trying to unregister another thread only if thread death does not trigger
5926  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5927  thread specific data destructor function to detect thread death. For
5928  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5929  is nothing. Thus, the workaround is applicable only for Windows static
5930  stat library. */
5931  __kmp_internal_end_library(-1);
5932 #if KMP_OS_WINDOWS
5933  __kmp_close_console();
5934 #endif
5935 }
5936 
5937 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5938  // It is assumed __kmp_forkjoin_lock is acquired.
5939 
5940  int gtid;
5941 
5942  KMP_DEBUG_ASSERT(thread != NULL);
5943 
5944  gtid = thread->th.th_info.ds.ds_gtid;
5945 
5946  if (!is_root) {
5947  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5948  /* Assume the threads are at the fork barrier here */
5949  KA_TRACE(
5950  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5951  gtid));
5952  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5953  * (GEH) */
5954  ANNOTATE_HAPPENS_BEFORE(thread);
5955  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5956  __kmp_release_64(&flag);
5957  }
5958 
5959  // Terminate OS thread.
5960  __kmp_reap_worker(thread);
5961 
5962  // The thread was killed asynchronously. If it was actively
5963  // spinning in the thread pool, decrement the global count.
5964  //
5965  // There is a small timing hole here - if the worker thread was just waking
5966  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5967  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5968  // the global counter might not get updated.
5969  //
5970  // Currently, this can only happen as the library is unloaded,
5971  // so there are no harmful side effects.
5972  if (thread->th.th_active_in_pool) {
5973  thread->th.th_active_in_pool = FALSE;
5974  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5975  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5976  }
5977  }
5978 
5979  __kmp_free_implicit_task(thread);
5980 
5981 // Free the fast memory for tasking
5982 #if USE_FAST_MEMORY
5983  __kmp_free_fast_memory(thread);
5984 #endif /* USE_FAST_MEMORY */
5985 
5986  __kmp_suspend_uninitialize_thread(thread);
5987 
5988  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5989  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5990 
5991  --__kmp_all_nth;
5992 // __kmp_nth was decremented when thread is added to the pool.
5993 
5994 #ifdef KMP_ADJUST_BLOCKTIME
5995  /* Adjust blocktime back to user setting or default if necessary */
5996  /* Middle initialization might never have occurred */
5997  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5998  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5999  if (__kmp_nth <= __kmp_avail_proc) {
6000  __kmp_zero_bt = FALSE;
6001  }
6002  }
6003 #endif /* KMP_ADJUST_BLOCKTIME */
6004 
6005  /* free the memory being used */
6006  if (__kmp_env_consistency_check) {
6007  if (thread->th.th_cons) {
6008  __kmp_free_cons_stack(thread->th.th_cons);
6009  thread->th.th_cons = NULL;
6010  }
6011  }
6012 
6013  if (thread->th.th_pri_common != NULL) {
6014  __kmp_free(thread->th.th_pri_common);
6015  thread->th.th_pri_common = NULL;
6016  }
6017 
6018  if (thread->th.th_task_state_memo_stack != NULL) {
6019  __kmp_free(thread->th.th_task_state_memo_stack);
6020  thread->th.th_task_state_memo_stack = NULL;
6021  }
6022 
6023 #if KMP_USE_BGET
6024  if (thread->th.th_local.bget_data != NULL) {
6025  __kmp_finalize_bget(thread);
6026  }
6027 #endif
6028 
6029 #if KMP_AFFINITY_SUPPORTED
6030  if (thread->th.th_affin_mask != NULL) {
6031  KMP_CPU_FREE(thread->th.th_affin_mask);
6032  thread->th.th_affin_mask = NULL;
6033  }
6034 #endif /* KMP_AFFINITY_SUPPORTED */
6035 
6036 #if KMP_USE_HIER_SCHED
6037  if (thread->th.th_hier_bar_data != NULL) {
6038  __kmp_free(thread->th.th_hier_bar_data);
6039  thread->th.th_hier_bar_data = NULL;
6040  }
6041 #endif
6042 
6043  __kmp_reap_team(thread->th.th_serial_team);
6044  thread->th.th_serial_team = NULL;
6045  __kmp_free(thread);
6046 
6047  KMP_MB();
6048 
6049 } // __kmp_reap_thread
6050 
6051 static void __kmp_internal_end(void) {
6052  int i;
6053 
6054  /* First, unregister the library */
6055  __kmp_unregister_library();
6056 
6057 #if KMP_OS_WINDOWS
6058  /* In Win static library, we can't tell when a root actually dies, so we
6059  reclaim the data structures for any root threads that have died but not
6060  unregistered themselves, in order to shut down cleanly.
6061  In Win dynamic library we also can't tell when a thread dies. */
6062  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6063 // dead roots
6064 #endif
6065 
6066  for (i = 0; i < __kmp_threads_capacity; i++)
6067  if (__kmp_root[i])
6068  if (__kmp_root[i]->r.r_active)
6069  break;
6070  KMP_MB(); /* Flush all pending memory write invalidates. */
6071  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6072 
6073  if (i < __kmp_threads_capacity) {
6074 #if KMP_USE_MONITOR
6075  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6076  KMP_MB(); /* Flush all pending memory write invalidates. */
6077 
6078  // Need to check that monitor was initialized before reaping it. If we are
6079  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6080  // __kmp_monitor will appear to contain valid data, but it is only valid in
6081  // the parent process, not the child.
6082  // New behavior (201008): instead of keying off of the flag
6083  // __kmp_init_parallel, the monitor thread creation is keyed off
6084  // of the new flag __kmp_init_monitor.
6085  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6086  if (TCR_4(__kmp_init_monitor)) {
6087  __kmp_reap_monitor(&__kmp_monitor);
6088  TCW_4(__kmp_init_monitor, 0);
6089  }
6090  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6091  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6092 #endif // KMP_USE_MONITOR
6093  } else {
6094 /* TODO move this to cleanup code */
6095 #ifdef KMP_DEBUG
6096  /* make sure that everything has properly ended */
6097  for (i = 0; i < __kmp_threads_capacity; i++) {
6098  if (__kmp_root[i]) {
6099  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6100  // there can be uber threads alive here
6101  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6102  }
6103  }
6104 #endif
6105 
6106  KMP_MB();
6107 
6108  // Reap the worker threads.
6109  // This is valid for now, but be careful if threads are reaped sooner.
6110  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6111  // Get the next thread from the pool.
6112  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6113  __kmp_thread_pool = thread->th.th_next_pool;
6114  // Reap it.
6115  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6116  thread->th.th_next_pool = NULL;
6117  thread->th.th_in_pool = FALSE;
6118  thread->th.th_active_in_pool = FALSE;
6119  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6120  __kmp_reap_thread(thread, 0);
6121  }
6122  __kmp_thread_pool_insert_pt = NULL;
6123 
6124  // Reap teams.
6125  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6126  // Get the next team from the pool.
6127  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6128  __kmp_team_pool = team->t.t_next_pool;
6129  // Reap it.
6130  team->t.t_next_pool = NULL;
6131  __kmp_reap_team(team);
6132  }
6133 
6134  __kmp_reap_task_teams();
6135 
6136 #if KMP_OS_UNIX
6137  // Threads that are not reaped should not access any resources since they
6138  // are going to be deallocated soon, so the shutdown sequence should wait
6139  // until all threads either exit the final spin-waiting loop or begin
6140  // sleeping after the given blocktime.
6141  for (i = 0; i < __kmp_threads_capacity; i++) {
6142  kmp_info_t *thr = __kmp_threads[i];
6143  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6144  KMP_CPU_PAUSE();
6145  }
6146 #endif
6147 
6148  for (i = 0; i < __kmp_threads_capacity; ++i) {
6149  // TBD: Add some checking...
6150  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6151  }
6152 
6153  /* Make sure all threadprivate destructors get run by joining with all
6154  worker threads before resetting this flag */
6155  TCW_SYNC_4(__kmp_init_common, FALSE);
6156 
6157  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6158  KMP_MB();
6159 
6160 #if KMP_USE_MONITOR
6161  // See note above: One of the possible fixes for CQ138434 / CQ140126
6162  //
6163  // FIXME: push both code fragments down and CSE them?
6164  // push them into __kmp_cleanup() ?
6165  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6166  if (TCR_4(__kmp_init_monitor)) {
6167  __kmp_reap_monitor(&__kmp_monitor);
6168  TCW_4(__kmp_init_monitor, 0);
6169  }
6170  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6171  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6172 #endif
6173  } /* else !__kmp_global.t_active */
6174  TCW_4(__kmp_init_gtid, FALSE);
6175  KMP_MB(); /* Flush all pending memory write invalidates. */
6176 
6177  __kmp_cleanup();
6178 #if OMPT_SUPPORT
6179  ompt_fini();
6180 #endif
6181 }
6182 
6183 void __kmp_internal_end_library(int gtid_req) {
6184  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6185  /* this shouldn't be a race condition because __kmp_internal_end() is the
6186  only place to clear __kmp_serial_init */
6187  /* we'll check this later too, after we get the lock */
6188  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6189 // redundant, because the next check will work in any case.
6190  if (__kmp_global.g.g_abort) {
6191  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6192  /* TODO abort? */
6193  return;
6194  }
6195  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6196  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6197  return;
6198  }
6199 
6200  KMP_MB(); /* Flush all pending memory write invalidates. */
6201 
6202  /* find out who we are and what we should do */
6203  {
6204  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6205  KA_TRACE(
6206  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6207  if (gtid == KMP_GTID_SHUTDOWN) {
6208  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6209  "already shutdown\n"));
6210  return;
6211  } else if (gtid == KMP_GTID_MONITOR) {
6212  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6213  "registered, or system shutdown\n"));
6214  return;
6215  } else if (gtid == KMP_GTID_DNE) {
6216  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6217  "shutdown\n"));
6218  /* we don't know who we are, but we may still shut down the library */
6219  } else if (KMP_UBER_GTID(gtid)) {
6220  /* unregister ourselves as an uber thread. gtid is no longer valid */
6221  if (__kmp_root[gtid]->r.r_active) {
6222  __kmp_global.g.g_abort = -1;
6223  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6224  KA_TRACE(10,
6225  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6226  gtid));
6227  return;
6228  } else {
6229  KA_TRACE(
6230  10,
6231  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6232  __kmp_unregister_root_current_thread(gtid);
6233  }
6234  } else {
6235 /* worker threads may call this function through the atexit handler, if they
6236  * call exit() */
6237 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6238  TODO: do a thorough shutdown instead */
6239 #ifdef DUMP_DEBUG_ON_EXIT
6240  if (__kmp_debug_buf)
6241  __kmp_dump_debug_buffer();
6242 #endif
6243  return;
6244  }
6245  }
6246  /* synchronize the termination process */
6247  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6248 
6249  /* have we already finished */
6250  if (__kmp_global.g.g_abort) {
6251  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6252  /* TODO abort? */
6253  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6254  return;
6255  }
6256  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6257  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6258  return;
6259  }
6260 
6261  /* We need this lock to enforce mutex between this reading of
6262  __kmp_threads_capacity and the writing by __kmp_register_root.
6263  Alternatively, we can use a counter of roots that is atomically updated by
6264  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6265  __kmp_internal_end_*. */
6266  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6267 
6268  /* now we can safely conduct the actual termination */
6269  __kmp_internal_end();
6270 
6271  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6272  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6273 
6274  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6275 
6276 #ifdef DUMP_DEBUG_ON_EXIT
6277  if (__kmp_debug_buf)
6278  __kmp_dump_debug_buffer();
6279 #endif
6280 
6281 #if KMP_OS_WINDOWS
6282  __kmp_close_console();
6283 #endif
6284 
6285  __kmp_fini_allocator();
6286 
6287 } // __kmp_internal_end_library
6288 
6289 void __kmp_internal_end_thread(int gtid_req) {
6290  int i;
6291 
6292  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6293  /* this shouldn't be a race condition because __kmp_internal_end() is the
6294  * only place to clear __kmp_serial_init */
6295  /* we'll check this later too, after we get the lock */
6296  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6297  // redundant, because the next check will work in any case.
6298  if (__kmp_global.g.g_abort) {
6299  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6300  /* TODO abort? */
6301  return;
6302  }
6303  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6304  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6305  return;
6306  }
6307 
6308  KMP_MB(); /* Flush all pending memory write invalidates. */
6309 
6310  /* find out who we are and what we should do */
6311  {
6312  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6313  KA_TRACE(10,
6314  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6315  if (gtid == KMP_GTID_SHUTDOWN) {
6316  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6317  "already shutdown\n"));
6318  return;
6319  } else if (gtid == KMP_GTID_MONITOR) {
6320  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6321  "registered, or system shutdown\n"));
6322  return;
6323  } else if (gtid == KMP_GTID_DNE) {
6324  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6325  "shutdown\n"));
6326  return;
6327  /* we don't know who we are */
6328  } else if (KMP_UBER_GTID(gtid)) {
6329  /* unregister ourselves as an uber thread. gtid is no longer valid */
6330  if (__kmp_root[gtid]->r.r_active) {
6331  __kmp_global.g.g_abort = -1;
6332  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333  KA_TRACE(10,
6334  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6335  gtid));
6336  return;
6337  } else {
6338  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6339  gtid));
6340  __kmp_unregister_root_current_thread(gtid);
6341  }
6342  } else {
6343  /* just a worker thread, let's leave */
6344  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6345 
6346  if (gtid >= 0) {
6347  __kmp_threads[gtid]->th.th_task_team = NULL;
6348  }
6349 
6350  KA_TRACE(10,
6351  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6352  gtid));
6353  return;
6354  }
6355  }
6356 #if KMP_DYNAMIC_LIB
6357 #if OMP_50_ENABLED
6358  if (__kmp_pause_status != kmp_hard_paused)
6359 #endif
6360  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6361  // it is better to shut down later, in the library destructor.
6362  {
6363  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6364  return;
6365  }
6366 #endif
6367  /* synchronize the termination process */
6368  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6369 
6370  /* have we already finished */
6371  if (__kmp_global.g.g_abort) {
6372  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6373  /* TODO abort? */
6374  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6375  return;
6376  }
6377  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6378  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6379  return;
6380  }
6381 
6382  /* We need this lock to enforce mutex between this reading of
6383  __kmp_threads_capacity and the writing by __kmp_register_root.
6384  Alternatively, we can use a counter of roots that is atomically updated by
6385  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6386  __kmp_internal_end_*. */
6387 
6388  /* should we finish the run-time? are all siblings done? */
6389  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6390 
6391  for (i = 0; i < __kmp_threads_capacity; ++i) {
6392  if (KMP_UBER_GTID(i)) {
6393  KA_TRACE(
6394  10,
6395  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6396  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6397  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6398  return;
6399  }
6400  }
6401 
6402  /* now we can safely conduct the actual termination */
6403 
6404  __kmp_internal_end();
6405 
6406  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6407  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6408 
6409  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6410 
6411 #ifdef DUMP_DEBUG_ON_EXIT
6412  if (__kmp_debug_buf)
6413  __kmp_dump_debug_buffer();
6414 #endif
6415 } // __kmp_internal_end_thread
6416 
6417 // -----------------------------------------------------------------------------
6418 // Library registration stuff.
6419 
6420 static long __kmp_registration_flag = 0;
6421 // Random value used to indicate library initialization.
6422 static char *__kmp_registration_str = NULL;
6423 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6424 
6425 static inline char *__kmp_reg_status_name() {
6426  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6427  each thread. If registration and unregistration go in different threads
6428  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6429  env var cannot be found, because the name will contain a different pid. */
6430  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6431 } // __kmp_reg_status_name
6432 
6433 void __kmp_register_library_startup(void) {
6434 
6435  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6436  int done = 0;
6437  union {
6438  double dtime;
6439  long ltime;
6440  } time;
6441 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6442  __kmp_initialize_system_tick();
6443 #endif
6444  __kmp_read_system_time(&time.dtime);
6445  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6446  __kmp_registration_str =
6447  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6448  __kmp_registration_flag, KMP_LIBRARY_FILE);
6449 
6450  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6451  __kmp_registration_str));
6452 
6453  while (!done) {
6454 
6455  char *value = NULL; // Actual value of the environment variable.
6456 
6457  // Set the environment variable, but do not overwrite it if it already exists.
6458  __kmp_env_set(name, __kmp_registration_str, 0);
6459  // Check whether the variable was actually written.
6460  value = __kmp_env_get(name);
6461  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6462 
6463  done = 1; // Ok, environment variable set successfully, exit the loop.
6464 
6465  } else {
6466 
6467  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6468  // Check whether it is alive or dead.
6469  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6470  char *tail = value;
6471  char *flag_addr_str = NULL;
6472  char *flag_val_str = NULL;
6473  char const *file_name = NULL;
6474  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6475  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6476  file_name = tail;
6477  if (tail != NULL) {
6478  long *flag_addr = 0;
6479  long flag_val = 0;
6480  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6481  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6482  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6483  // First, check whether environment-encoded address is mapped into
6484  // addr space.
6485  // If so, dereference it to see if it still has the right value.
6486  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6487  neighbor = 1;
6488  } else {
6489  // If not, then we know the other copy of the library is no longer
6490  // running.
6491  neighbor = 2;
6492  }
6493  }
6494  }
6495  switch (neighbor) {
6496  case 0: // Cannot parse environment variable -- neighbor status unknown.
6497  // Assume it is the incompatible format of a future version of the
6498  // library, and assume the other library is alive.
6499  // WARN( ... ); // TODO: Issue a warning.
6500  file_name = "unknown library";
6501  KMP_FALLTHROUGH();
6502  // Attention! Falling through to the next case. That's intentional.
6503  case 1: { // Neighbor is alive.
6504  // Check whether it is allowed.
6505  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6506  if (!__kmp_str_match_true(duplicate_ok)) {
6507  // That's not allowed. Issue fatal error.
6508  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6509  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6510  }
6511  KMP_INTERNAL_FREE(duplicate_ok);
6512  __kmp_duplicate_library_ok = 1;
6513  done = 1; // Exit the loop.
6514  } break;
6515  case 2: { // Neighbor is dead.
6516  // Clear the variable and try to register library again.
6517  __kmp_env_unset(name);
6518  } break;
6519  default: { KMP_DEBUG_ASSERT(0); } break;
6520  }
6521  }
6522  KMP_INTERNAL_FREE((void *)value);
6523  }
6524  KMP_INTERNAL_FREE((void *)name);
6525 
6526 } // func __kmp_register_library_startup
6527 
6528 void __kmp_unregister_library(void) {
6529 
6530  char *name = __kmp_reg_status_name();
6531  char *value = __kmp_env_get(name);
6532 
6533  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6534  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6535  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6536  // Ok, this is our variable. Delete it.
6537  __kmp_env_unset(name);
6538  }
6539 
6540  KMP_INTERNAL_FREE(__kmp_registration_str);
6541  KMP_INTERNAL_FREE(value);
6542  KMP_INTERNAL_FREE(name);
6543 
6544  __kmp_registration_flag = 0;
6545  __kmp_registration_str = NULL;
6546 
6547 } // __kmp_unregister_library
6548 
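// A simplified stand-alone model of the registration handshake above (not
// part of the runtime; excluded from the build): publish the address and
// value of a per-copy flag in an environment variable, and treat a matching
// value as proof that this copy owns the process. The demo_* names, the
// POSIX setenv()/unsetenv() calls, and the omission of the
// "is the address still mapped" check are assumptions of this sketch.
#if 0
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

static unsigned long demo_registration_flag = 0;

// Returns 1 if this copy won the registration race, 0 otherwise.
static int demo_register(const char *env_name) {
  char desired[64];
  demo_registration_flag =
      0xCAFE0000UL | ((unsigned long)time(NULL) & 0xFFFFUL);
  std::snprintf(desired, sizeof(desired), "%p-%lx",
                (void *)&demo_registration_flag, demo_registration_flag);
  setenv(env_name, desired, /*overwrite=*/0); // never clobber an existing value
  const char *actual = getenv(env_name);
  if (actual != NULL && std::strcmp(actual, desired) == 0)
    return 1; // the variable carries our flag: this copy is registered
  // Another copy registered first. A full implementation would parse "actual",
  // check whether that copy is still alive, and either abort or clear the
  // stale variable and retry, as the loop above does.
  return 0;
}

static void demo_unregister(const char *env_name) {
  char mine[64];
  std::snprintf(mine, sizeof(mine), "%p-%lx",
                (void *)&demo_registration_flag, demo_registration_flag);
  const char *actual = getenv(env_name);
  if (actual != NULL && std::strcmp(actual, mine) == 0)
    unsetenv(env_name); // only delete the variable if it is still ours
}
#endif
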
6549 // End of Library registration stuff.
6550 // -----------------------------------------------------------------------------
6551 
6552 #if KMP_MIC_SUPPORTED
6553 
6554 static void __kmp_check_mic_type() {
6555  kmp_cpuid_t cpuid_state = {0};
6556  kmp_cpuid_t *cs_p = &cpuid_state;
6557  __kmp_x86_cpuid(1, 0, cs_p);
6558  // We don't support mic1 at the moment
6559  if ((cs_p->eax & 0xff0) == 0xB10) {
6560  __kmp_mic_type = mic2;
6561  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6562  __kmp_mic_type = mic3;
6563  } else {
6564  __kmp_mic_type = non_mic;
6565  }
6566 }
6567 
6568 #endif /* KMP_MIC_SUPPORTED */
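
// Stand-alone illustration of the CPUID check performed by
// __kmp_check_mic_type() above (not part of the runtime; excluded from the
// build), using the GCC/Clang <cpuid.h> helper instead of __kmp_x86_cpuid().
// The bit masks are the same ones used above; the demo_* names and the
// x86-only assumption belong to this sketch.
#if 0
#include <cpuid.h>

enum demo_mic_type { demo_non_mic, demo_mic2, demo_mic3 };

static demo_mic_type demo_check_mic_type() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return demo_non_mic;            // CPUID leaf 1 not available
  if ((eax & 0xff0) == 0xB10)       // family/model bits matched for KNC above
    return demo_mic2;
  if ((eax & 0xf0ff0) == 0x50670)   // family/model bits matched for KNL above
    return demo_mic3;
  return demo_non_mic;
}
#endif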
6569 
6570 static void __kmp_do_serial_initialize(void) {
6571  int i, gtid;
6572  int size;
6573 
6574  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6575 
6576  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6577  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6578  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6579  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6580  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6581 
6582 #if OMPT_SUPPORT
6583  ompt_pre_init();
6584 #endif
6585 
6586  __kmp_validate_locks();
6587 
6588  /* Initialize internal memory allocator */
6589  __kmp_init_allocator();
6590 
6591  /* Register the library startup via an environment variable and check to see
6592  whether another copy of the library is already registered. */
6593 
6594  __kmp_register_library_startup();
6595 
6596  /* TODO reinitialization of library */
6597  if (TCR_4(__kmp_global.g.g_done)) {
6598  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6599  }
6600 
6601  __kmp_global.g.g_abort = 0;
6602  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6603 
6604 /* initialize the locks */
6605 #if KMP_USE_ADAPTIVE_LOCKS
6606 #if KMP_DEBUG_ADAPTIVE_LOCKS
6607  __kmp_init_speculative_stats();
6608 #endif
6609 #endif
6610 #if KMP_STATS_ENABLED
6611  __kmp_stats_init();
6612 #endif
6613  __kmp_init_lock(&__kmp_global_lock);
6614  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6615  __kmp_init_lock(&__kmp_debug_lock);
6616  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6617  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6618  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6619  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6620  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6621  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6622  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6623  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6624  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6625  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6626  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6627  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6628  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6629  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6630  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6631 #if KMP_USE_MONITOR
6632  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6633 #endif
6634  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6635 
6636  /* conduct initialization and initial setup of configuration */
6637 
6638  __kmp_runtime_initialize();
6639 
6640 #if KMP_MIC_SUPPORTED
6641  __kmp_check_mic_type();
6642 #endif
6643 
6644 // Some global variable initialization moved here from kmp_env_initialize()
6645 #ifdef KMP_DEBUG
6646  kmp_diag = 0;
6647 #endif
6648  __kmp_abort_delay = 0;
6649 
6650  // From __kmp_init_dflt_team_nth()
6651  /* assume the entire machine will be used */
6652  __kmp_dflt_team_nth_ub = __kmp_xproc;
6653  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6654  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6655  }
6656  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6657  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6658  }
6659  __kmp_max_nth = __kmp_sys_max_nth;
6660  __kmp_cg_max_nth = __kmp_sys_max_nth;
6661  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6662  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6663  __kmp_teams_max_nth = __kmp_sys_max_nth;
6664  }
6665 
6666  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6667  // part
6668  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6669 #if KMP_USE_MONITOR
6670  __kmp_monitor_wakeups =
6671  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6672  __kmp_bt_intervals =
6673  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6674 #endif
6675  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6676  __kmp_library = library_throughput;
6677  // From KMP_SCHEDULE initialization
6678  __kmp_static = kmp_sch_static_balanced;
6679 // AC: do not use analytical here, because it is non-monotonic
6680 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6681 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6682 // need to repeat assignment
6683 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6684 // bit control and barrier method control parts
6685 #if KMP_FAST_REDUCTION_BARRIER
6686 #define kmp_reduction_barrier_gather_bb ((int)1)
6687 #define kmp_reduction_barrier_release_bb ((int)1)
6688 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6689 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6690 #endif // KMP_FAST_REDUCTION_BARRIER
6691  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6692  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6693  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6694  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6695  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6696 #if KMP_FAST_REDUCTION_BARRIER
6697  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6698  // lin_64 ): hyper,1
6699  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6700  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6701  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6702  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6703  }
6704 #endif // KMP_FAST_REDUCTION_BARRIER
6705  }
6706 #if KMP_FAST_REDUCTION_BARRIER
6707 #undef kmp_reduction_barrier_release_pat
6708 #undef kmp_reduction_barrier_gather_pat
6709 #undef kmp_reduction_barrier_release_bb
6710 #undef kmp_reduction_barrier_gather_bb
6711 #endif // KMP_FAST_REDUCTION_BARRIER
6712 #if KMP_MIC_SUPPORTED
6713  if (__kmp_mic_type == mic2) { // KNC
6714  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6715  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6716  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6717  1; // forkjoin release
6718  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6719  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6720  }
6721 #if KMP_FAST_REDUCTION_BARRIER
6722  if (__kmp_mic_type == mic2) { // KNC
6723  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6724  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6725  }
6726 #endif // KMP_FAST_REDUCTION_BARRIER
6727 #endif // KMP_MIC_SUPPORTED
6728 
6729 // From KMP_CHECKS initialization
6730 #ifdef KMP_DEBUG
6731  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6732 #else
6733  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6734 #endif
6735 
6736  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6737  __kmp_foreign_tp = TRUE;
6738 
6739  __kmp_global.g.g_dynamic = FALSE;
6740  __kmp_global.g.g_dynamic_mode = dynamic_default;
6741 
6742  __kmp_env_initialize(NULL);
6743 
6744 // Print all messages in message catalog for testing purposes.
6745 #ifdef KMP_DEBUG
6746  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6747  if (__kmp_str_match_true(val)) {
6748  kmp_str_buf_t buffer;
6749  __kmp_str_buf_init(&buffer);
6750  __kmp_i18n_dump_catalog(&buffer);
6751  __kmp_printf("%s", buffer.str);
6752  __kmp_str_buf_free(&buffer);
6753  }
6754  __kmp_env_free(&val);
6755 #endif
6756 
6757  __kmp_threads_capacity =
6758  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6759  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6760  __kmp_tp_capacity = __kmp_default_tp_capacity(
6761  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6762 
6763  // If the library is shut down properly, both pools must be NULL. Just in
6764  // case, set them to NULL -- some memory may leak, but subsequent code will
6765  // work even if pools are not freed.
6766  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6767  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6768  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6769  __kmp_thread_pool = NULL;
6770  __kmp_thread_pool_insert_pt = NULL;
6771  __kmp_team_pool = NULL;
6772 
6773  /* Allocate all of the variable sized records */
6774  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6775  * expandable */
6776  /* Since allocation is cache-aligned, just add extra padding at the end */
6777  size =
6778  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6779  CACHE_LINE;
6780  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6781  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6782  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6783 
6784  /* init thread counts */
6785  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6786  0); // Asserts fail if the library is reinitializing and
6787  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6788  __kmp_all_nth = 0;
6789  __kmp_nth = 0;
6790 
6791  /* setup the uber master thread and hierarchy */
6792  gtid = __kmp_register_root(TRUE);
6793  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6794  KMP_ASSERT(KMP_UBER_GTID(gtid));
6795  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6796 
6797  KMP_MB(); /* Flush all pending memory write invalidates. */
6798 
6799  __kmp_common_initialize();
6800 
6801 #if KMP_OS_UNIX
6802  /* invoke the child fork handler */
6803  __kmp_register_atfork();
6804 #endif
6805 
6806 #if !KMP_DYNAMIC_LIB
6807  {
6808  /* Invoke the exit handler when the program finishes, only for static
6809  library. For dynamic library, we already have _fini and DllMain. */
6810  int rc = atexit(__kmp_internal_end_atexit);
6811  if (rc != 0) {
6812  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6813  __kmp_msg_null);
6814  }
6815  }
6816 #endif
6817 
6818 #if KMP_HANDLE_SIGNALS
6819 #if KMP_OS_UNIX
6820  /* NOTE: make sure that this is called before the user installs their own
6821  signal handlers so that the user handlers are called first. this way they
6822  can return false, not call our handler, avoid terminating the library, and
6823  continue execution where they left off. */
6824  __kmp_install_signals(FALSE);
6825 #endif /* KMP_OS_UNIX */
6826 #if KMP_OS_WINDOWS
6827  __kmp_install_signals(TRUE);
6828 #endif /* KMP_OS_WINDOWS */
6829 #endif
6830 
6831  /* we have finished the serial initialization */
6832  __kmp_init_counter++;
6833 
6834  __kmp_init_serial = TRUE;
6835 
6836  if (__kmp_settings) {
6837  __kmp_env_print();
6838  }
6839 
6840 #if OMP_40_ENABLED
6841  if (__kmp_display_env || __kmp_display_env_verbose) {
6842  __kmp_env_print_2();
6843  }
6844 #endif // OMP_40_ENABLED
6845 
6846 #if OMPT_SUPPORT
6847  ompt_post_init();
6848 #endif
6849 
6850  KMP_MB();
6851 
6852  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6853 }
6854 
6855 void __kmp_serial_initialize(void) {
6856  if (__kmp_init_serial) {
6857  return;
6858  }
6859  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6860  if (__kmp_init_serial) {
6861  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6862  return;
6863  }
6864  __kmp_do_serial_initialize();
6865  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6866 }
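
// __kmp_serial_initialize(), __kmp_middle_initialize() and
// __kmp_parallel_initialize() all follow the same double-checked pattern:
// test the "initialized" flag, take the bootstrap lock, re-test under the
// lock, and only then run the expensive initializer. A minimal stand-alone
// rendering of that pattern in standard C++ (not part of the runtime;
// excluded from the build; the demo_* names are invented):
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> demo_init_done{false};
static std::mutex demo_init_lock;

static void demo_do_initialize() { /* expensive one-time setup goes here */ }

static void demo_initialize() {
  if (demo_init_done.load(std::memory_order_acquire))
    return; // fast path: already initialized, no lock taken
  std::lock_guard<std::mutex> guard(demo_init_lock);
  if (demo_init_done.load(std::memory_order_relaxed))
    return; // another thread finished initialization while we waited
  demo_do_initialize();
  demo_init_done.store(true, std::memory_order_release);
}
#endif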
6867 
6868 static void __kmp_do_middle_initialize(void) {
6869  int i, j;
6870  int prev_dflt_team_nth;
6871 
6872  if (!__kmp_init_serial) {
6873  __kmp_do_serial_initialize();
6874  }
6875 
6876  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6877 
6878  // Save the previous value for the __kmp_dflt_team_nth so that
6879  // we can avoid some reinitialization if it hasn't changed.
6880  prev_dflt_team_nth = __kmp_dflt_team_nth;
6881 
6882 #if KMP_AFFINITY_SUPPORTED
6883  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6884  // number of cores on the machine.
6885  __kmp_affinity_initialize();
6886 
6887  // Run through the __kmp_threads array and set the affinity mask
6888  // for each root thread that is currently registered with the RTL.
6889  for (i = 0; i < __kmp_threads_capacity; i++) {
6890  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6891  __kmp_affinity_set_init_mask(i, TRUE);
6892  }
6893  }
6894 #endif /* KMP_AFFINITY_SUPPORTED */
6895 
6896  KMP_ASSERT(__kmp_xproc > 0);
6897  if (__kmp_avail_proc == 0) {
6898  __kmp_avail_proc = __kmp_xproc;
6899  }
6900 
6901  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6902  // correct them now
6903  j = 0;
6904  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6905  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6906  __kmp_avail_proc;
6907  j++;
6908  }
6909 
6910  if (__kmp_dflt_team_nth == 0) {
6911 #ifdef KMP_DFLT_NTH_CORES
6912  // Default #threads = #cores
6913  __kmp_dflt_team_nth = __kmp_ncores;
6914  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6915  "__kmp_ncores (%d)\n",
6916  __kmp_dflt_team_nth));
6917 #else
6918  // Default #threads = #available OS procs
6919  __kmp_dflt_team_nth = __kmp_avail_proc;
6920  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6921  "__kmp_avail_proc(%d)\n",
6922  __kmp_dflt_team_nth));
6923 #endif /* KMP_DFLT_NTH_CORES */
6924  }
6925 
6926  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6927  __kmp_dflt_team_nth = KMP_MIN_NTH;
6928  }
6929  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6930  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6931  }
6932 
6933  // There's no harm in continuing if the following check fails,
6934  // but it indicates an error in the previous logic.
6935  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6936 
6937  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6938  // Run through the __kmp_threads array and set the num threads icv for each
6939  // root thread that is currently registered with the RTL (which has not
6940  // already explicitly set its nthreads-var with a call to
6941  // omp_set_num_threads()).
6942  for (i = 0; i < __kmp_threads_capacity; i++) {
6943  kmp_info_t *thread = __kmp_threads[i];
6944  if (thread == NULL)
6945  continue;
6946  if (thread->th.th_current_task->td_icvs.nproc != 0)
6947  continue;
6948 
6949  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6950  }
6951  }
6952  KA_TRACE(
6953  20,
6954  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6955  __kmp_dflt_team_nth));
6956 
6957 #ifdef KMP_ADJUST_BLOCKTIME
6958  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6959  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6960  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6961  if (__kmp_nth > __kmp_avail_proc) {
6962  __kmp_zero_bt = TRUE;
6963  }
6964  }
6965 #endif /* KMP_ADJUST_BLOCKTIME */
6966 
6967  /* we have finished middle initialization */
6968  TCW_SYNC_4(__kmp_init_middle, TRUE);
6969 
6970  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6971 }
6972 
6973 void __kmp_middle_initialize(void) {
6974  if (__kmp_init_middle) {
6975  return;
6976  }
6977  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6978  if (__kmp_init_middle) {
6979  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6980  return;
6981  }
6982  __kmp_do_middle_initialize();
6983  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6984 }
6985 
6986 void __kmp_parallel_initialize(void) {
6987  int gtid = __kmp_entry_gtid(); // this might be a new root
6988 
6989  /* synchronize parallel initialization (for sibling) */
6990  if (TCR_4(__kmp_init_parallel))
6991  return;
6992  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6993  if (TCR_4(__kmp_init_parallel)) {
6994  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6995  return;
6996  }
6997 
6998  /* TODO reinitialization after we have already shut down */
6999  if (TCR_4(__kmp_global.g.g_done)) {
7000  KA_TRACE(
7001  10,
7002  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7003  __kmp_infinite_loop();
7004  }
7005 
7006  /* jc: The lock __kmp_initz_lock is already held, so calling
7007  __kmp_serial_initialize would cause a deadlock. So we call
7008  __kmp_do_serial_initialize directly. */
7009  if (!__kmp_init_middle) {
7010  __kmp_do_middle_initialize();
7011  }
7012 
7013 #if OMP_50_ENABLED
7014  __kmp_resume_if_hard_paused();
7015 #endif
7016 
7017  /* begin initialization */
7018  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7019  KMP_ASSERT(KMP_UBER_GTID(gtid));
7020 
7021 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7022  // Save the FP control regs.
7023  // Worker threads will set theirs to these values at thread startup.
7024  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7025  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7026  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7027 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7028 
7029 #if KMP_OS_UNIX
7030 #if KMP_HANDLE_SIGNALS
7031  /* must be after __kmp_serial_initialize */
7032  __kmp_install_signals(TRUE);
7033 #endif
7034 #endif
7035 
7036  __kmp_suspend_initialize();
7037 
7038 #if defined(USE_LOAD_BALANCE)
7039  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7040  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7041  }
7042 #else
7043  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7044  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7045  }
7046 #endif
7047 
7048  if (__kmp_version) {
7049  __kmp_print_version_2();
7050  }
7051 
7052  /* we have finished parallel initialization */
7053  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7054 
7055  KMP_MB();
7056  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7057 
7058  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7059 }
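
// On x86, the parallel-initialization path above captures the master's
// floating-point control state so worker threads can adopt it at startup.
// Below is a stand-alone sketch of the SSE part using the standard
// _mm_getcsr() intrinsic (not part of the runtime; excluded from the build).
// The x87 control word needs inline assembly and is omitted; the 0xffffffc0
// mask, which keeps the MXCSR control bits and drops the sticky exception
// flags, is an assumption of this sketch rather than KMP_X86_MXCSR_MASK
// itself.
#if 0
#include <xmmintrin.h>

static unsigned int demo_capture_mxcsr() {
  // Bits 0-5 of MXCSR are the sticky exception flags; mask them off so only
  // the rounding/masking control bits are propagated to worker threads.
  return _mm_getcsr() & 0xffffffc0u;
}
#endif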
7060 
7061 /* ------------------------------------------------------------------------ */
7062 
7063 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7064  kmp_team_t *team) {
7065  kmp_disp_t *dispatch;
7066 
7067  KMP_MB();
7068 
7069  /* none of the threads have encountered any constructs, yet. */
7070  this_thr->th.th_local.this_construct = 0;
7071 #if KMP_CACHE_MANAGE
7072  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7073 #endif /* KMP_CACHE_MANAGE */
7074  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7075  KMP_DEBUG_ASSERT(dispatch);
7076  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7077  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7078  // this_thr->th.th_info.ds.ds_tid ] );
7079 
7080  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7081 #if OMP_45_ENABLED
7082  dispatch->th_doacross_buf_idx =
7083  0; /* reset the doacross dispatch buffer counter */
7084 #endif
7085  if (__kmp_env_consistency_check)
7086  __kmp_push_parallel(gtid, team->t.t_ident);
7087 
7088  KMP_MB(); /* Flush all pending memory write invalidates. */
7089 }
7090 
7091 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7092  kmp_team_t *team) {
7093  if (__kmp_env_consistency_check)
7094  __kmp_pop_parallel(gtid, team->t.t_ident);
7095 
7096  __kmp_finish_implicit_task(this_thr);
7097 }
7098 
7099 int __kmp_invoke_task_func(int gtid) {
7100  int rc;
7101  int tid = __kmp_tid_from_gtid(gtid);
7102  kmp_info_t *this_thr = __kmp_threads[gtid];
7103  kmp_team_t *team = this_thr->th.th_team;
7104 
7105  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7106 #if USE_ITT_BUILD
7107  if (__itt_stack_caller_create_ptr) {
7108  __kmp_itt_stack_callee_enter(
7109  (__itt_caller)
7110  team->t.t_stack_id); // inform ittnotify about entering user's code
7111  }
7112 #endif /* USE_ITT_BUILD */
7113 #if INCLUDE_SSC_MARKS
7114  SSC_MARK_INVOKING();
7115 #endif
7116 
7117 #if OMPT_SUPPORT
7118  void *dummy;
7119  void **exit_runtime_p;
7120  ompt_data_t *my_task_data;
7121  ompt_data_t *my_parallel_data;
7122  int ompt_team_size;
7123 
7124  if (ompt_enabled.enabled) {
7125  exit_runtime_p = &(
7126  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7127  } else {
7128  exit_runtime_p = &dummy;
7129  }
7130 
7131  my_task_data =
7132  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7133  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7134  if (ompt_enabled.ompt_callback_implicit_task) {
7135  ompt_team_size = team->t.t_nproc;
7136  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7137  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7138  __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7139  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7140  }
7141 #endif
7142 
7143 #if KMP_STATS_ENABLED
7144  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7145  if (previous_state == stats_state_e::TEAMS_REGION) {
7146  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7147  } else {
7148  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7149  }
7150  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7151 #endif
7152 
7153  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7154  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7155 #if OMPT_SUPPORT
7156  ,
7157  exit_runtime_p
7158 #endif
7159  );
7160 #if OMPT_SUPPORT
7161  *exit_runtime_p = NULL;
7162 #endif
7163 
7164 #if KMP_STATS_ENABLED
7165  if (previous_state == stats_state_e::TEAMS_REGION) {
7166  KMP_SET_THREAD_STATE(previous_state);
7167  }
7168  KMP_POP_PARTITIONED_TIMER();
7169 #endif
7170 
7171 #if USE_ITT_BUILD
7172  if (__itt_stack_caller_create_ptr) {
7173  __kmp_itt_stack_callee_leave(
7174  (__itt_caller)
7175  team->t.t_stack_id); // inform ittnotify about leaving user's code
7176  }
7177 #endif /* USE_ITT_BUILD */
7178  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7179 
7180  return rc;
7181 }
7182 
7183 #if OMP_40_ENABLED
7184 void __kmp_teams_master(int gtid) {
7185  // This routine is called by all master threads in the teams construct
7186  kmp_info_t *thr = __kmp_threads[gtid];
7187  kmp_team_t *team = thr->th.th_team;
7188  ident_t *loc = team->t.t_ident;
7189  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7190  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7191  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7192  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7193  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7194 
7195  // This thread is a new CG root. Set up the proper variables.
7196  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7197  tmp->cg_root = thr; // Make thr the CG root
7198  // Init to thread limit that was stored when league masters were forked
7199  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7200  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7201  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7202  " cg_threads to 1\n",
7203  thr, tmp));
7204  tmp->up = thr->th.th_cg_roots;
7205  thr->th.th_cg_roots = tmp;
7206 
7207 // Launch the league of teams now, but do not let workers execute
7208 // (they hang at the fork barrier until the next parallel region)
7209 #if INCLUDE_SSC_MARKS
7210  SSC_MARK_FORKING();
7211 #endif
7212  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7213  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7214  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7215 #if INCLUDE_SSC_MARKS
7216  SSC_MARK_JOINING();
7217 #endif
7218  // If the team size was reduced from the limit, set it to the new size
7219  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7220  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7221  // AC: last parameter "1" eliminates join barrier which won't work because
7222  // worker threads are in a fork barrier waiting for more parallel regions
7223  __kmp_join_call(loc, gtid
7224 #if OMPT_SUPPORT
7225  ,
7226  fork_context_intel
7227 #endif
7228  ,
7229  1);
7230 }
7231 
7232 int __kmp_invoke_teams_master(int gtid) {
7233  kmp_info_t *this_thr = __kmp_threads[gtid];
7234  kmp_team_t *team = this_thr->th.th_team;
7235 #if KMP_DEBUG
7236  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7237  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7238  (void *)__kmp_teams_master);
7239 #endif
7240  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7241  __kmp_teams_master(gtid);
7242  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7243  return 1;
7244 }
7245 #endif /* OMP_40_ENABLED */
7246 
7247 /* This sets the requested number of threads for the next parallel region
7248  encountered by this team. Since this should be enclosed in the forkjoin
7249  critical section, it should avoid race conditions with asymmetrical nested
7250  parallelism. */
7251 
7252 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7253  kmp_info_t *thr = __kmp_threads[gtid];
7254 
7255  if (num_threads > 0)
7256  thr->th.th_set_nproc = num_threads;
7257 }
7258 
7259 #if OMP_40_ENABLED
7260 
7261 /* this sets the requested number of teams for the teams region and/or
7262  the number of threads for the next parallel region encountered */
7263 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7264  int num_threads) {
7265  kmp_info_t *thr = __kmp_threads[gtid];
7266  KMP_DEBUG_ASSERT(num_teams >= 0);
7267  KMP_DEBUG_ASSERT(num_threads >= 0);
7268 
7269  if (num_teams == 0)
7270  num_teams = 1; // default number of teams is 1.
7271  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7272  if (!__kmp_reserve_warn) {
7273  __kmp_reserve_warn = 1;
7274  __kmp_msg(kmp_ms_warning,
7275  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7276  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7277  }
7278  num_teams = __kmp_teams_max_nth;
7279  }
7280  // Set number of teams (number of threads in the outer "parallel" of the
7281  // teams)
7282  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7283 
7284  // Remember the number of threads for inner parallel regions
7285  if (num_threads == 0) {
7286  if (!TCR_4(__kmp_init_middle))
7287  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7288  num_threads = __kmp_avail_proc / num_teams;
7289  if (num_teams * num_threads > __kmp_teams_max_nth) {
7290  // adjust num_threads w/o warning as it is not a user setting
7291  num_threads = __kmp_teams_max_nth / num_teams;
7292  }
7293  } else {
7294  // This thread will be the master of the league masters
7295  // Store new thread limit; old limit is saved in th_cg_roots list
7296  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7297 
7298  if (num_teams * num_threads > __kmp_teams_max_nth) {
7299  int new_threads = __kmp_teams_max_nth / num_teams;
7300  if (!__kmp_reserve_warn) { // user asked for too many threads
7301  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7302  __kmp_msg(kmp_ms_warning,
7303  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7304  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7305  }
7306  num_threads = new_threads;
7307  }
7308  }
7309  thr->th.th_teams_size.nth = num_threads;
7310 }
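
// Distillation of the sizing arithmetic in __kmp_push_num_teams() above
// (not part of the runtime; excluded from the build): clamp the number of
// teams to the global limit, then derive or clamp the per-team thread count
// so that teams * threads never exceeds that limit. The demo_* names are
// invented, warnings are omitted, and teams_max_nth is assumed to be >= 1.
#if 0
struct demo_teams_size {
  int nteams;
  int nth;
};

static demo_teams_size demo_size_teams(int num_teams, int num_threads,
                                       int teams_max_nth, int avail_proc) {
  demo_teams_size result;
  if (num_teams <= 0)
    num_teams = 1;             // default: a single team
  if (num_teams > teams_max_nth)
    num_teams = teams_max_nth; // too many teams requested
  result.nteams = num_teams;
  if (num_threads <= 0)
    num_threads = avail_proc / num_teams; // spread available procs over teams
  if (num_teams * num_threads > teams_max_nth)
    num_threads = teams_max_nth / num_teams; // keep the product bounded
  result.nth = num_threads;
  return result;
}
#endif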
7311 
7312 // Set the proc_bind var to use in the following parallel region.
7313 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7314  kmp_info_t *thr = __kmp_threads[gtid];
7315  thr->th.th_set_proc_bind = proc_bind;
7316 }
7317 
7318 #endif /* OMP_40_ENABLED */
7319 
7320 /* Launch the worker threads into the microtask. */
7321 
7322 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7323  kmp_info_t *this_thr = __kmp_threads[gtid];
7324 
7325 #ifdef KMP_DEBUG
7326  int f;
7327 #endif /* KMP_DEBUG */
7328 
7329  KMP_DEBUG_ASSERT(team);
7330  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7331  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7332  KMP_MB(); /* Flush all pending memory write invalidates. */
7333 
7334  team->t.t_construct = 0; /* no single directives seen yet */
7335  team->t.t_ordered.dt.t_value =
7336  0; /* thread 0 enters the ordered section first */
7337 
7338  /* Reset the identifiers on the dispatch buffer */
7339  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7340  if (team->t.t_max_nproc > 1) {
7341  int i;
7342  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7343  team->t.t_disp_buffer[i].buffer_index = i;
7344 #if OMP_45_ENABLED
7345  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7346 #endif
7347  }
7348  } else {
7349  team->t.t_disp_buffer[0].buffer_index = 0;
7350 #if OMP_45_ENABLED
7351  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7352 #endif
7353  }
7354 
7355  KMP_MB(); /* Flush all pending memory write invalidates. */
7356  KMP_ASSERT(this_thr->th.th_team == team);
7357 
7358 #ifdef KMP_DEBUG
7359  for (f = 0; f < team->t.t_nproc; f++) {
7360  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7361  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7362  }
7363 #endif /* KMP_DEBUG */
7364 
7365  /* release the worker threads so they may begin working */
7366  __kmp_fork_barrier(gtid, 0);
7367 }
7368 
7369 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7370  kmp_info_t *this_thr = __kmp_threads[gtid];
7371 
7372  KMP_DEBUG_ASSERT(team);
7373  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7374  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7375  KMP_MB(); /* Flush all pending memory write invalidates. */
7376 
7377 /* Join barrier after fork */
7378 
7379 #ifdef KMP_DEBUG
7380  if (__kmp_threads[gtid] &&
7381  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7382  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7383  __kmp_threads[gtid]);
7384  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7385  "team->t.t_nproc=%d\n",
7386  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7387  team->t.t_nproc);
7388  __kmp_print_structure();
7389  }
7390  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7391  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7392 #endif /* KMP_DEBUG */
7393 
7394  __kmp_join_barrier(gtid); /* wait for everyone */
7395 #if OMPT_SUPPORT
7396  if (ompt_enabled.enabled &&
7397  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7398  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7399  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7400  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7401 #if OMPT_OPTIONAL
7402  void *codeptr = NULL;
7403  if (KMP_MASTER_TID(ds_tid) &&
7404  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7405  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7406  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7407 
7408  if (ompt_enabled.ompt_callback_sync_region_wait) {
7409  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7410  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7411  codeptr);
7412  }
7413  if (ompt_enabled.ompt_callback_sync_region) {
7414  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7415  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7416  codeptr);
7417  }
7418 #endif
7419  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7420  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7421  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7422  }
7423  }
7424 #endif
7425 
7426  KMP_MB(); /* Flush all pending memory write invalidates. */
7427  KMP_ASSERT(this_thr->th.th_team == team);
7428 }
7429 
7430 /* ------------------------------------------------------------------------ */
7431 
7432 #ifdef USE_LOAD_BALANCE
7433 
7434 // Return the number of worker threads actively spinning in the hot team if we
7435 // are at the outermost level of parallelism. Otherwise, return 0.
7436 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7437  int i;
7438  int retval;
7439  kmp_team_t *hot_team;
7440 
7441  if (root->r.r_active) {
7442  return 0;
7443  }
7444  hot_team = root->r.r_hot_team;
7445  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7446  return hot_team->t.t_nproc - 1; // Don't count master thread
7447  }
7448 
7449  // Skip the master thread - it is accounted for elsewhere.
7450  retval = 0;
7451  for (i = 1; i < hot_team->t.t_nproc; i++) {
7452  if (hot_team->t.t_threads[i]->th.th_active) {
7453  retval++;
7454  }
7455  }
7456  return retval;
7457 }
7458 
7459 // Perform an automatic adjustment to the number of
7460 // threads used by the next parallel region.
7461 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7462  int retval;
7463  int pool_active;
7464  int hot_team_active;
7465  int team_curr_active;
7466  int system_active;
7467 
7468  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7469  set_nproc));
7470  KMP_DEBUG_ASSERT(root);
7471  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7472  ->th.th_current_task->td_icvs.dynamic == TRUE);
7473  KMP_DEBUG_ASSERT(set_nproc > 1);
7474 
7475  if (set_nproc == 1) {
7476  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7477  return 1;
7478  }
7479 
7480  // Threads that are active in the thread pool, active in the hot team for this
7481  // particular root (if we are at the outer par level), and the currently
7482  // executing thread (to become the master) are available to add to the new
7483  // team, but are currently contributing to the system load, and must be
7484  // accounted for.
7485  pool_active = __kmp_thread_pool_active_nth;
7486  hot_team_active = __kmp_active_hot_team_nproc(root);
7487  team_curr_active = pool_active + hot_team_active + 1;
7488 
7489  // Check the system load.
7490  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7491  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7492  "hot team active = %d\n",
7493  system_active, pool_active, hot_team_active));
7494 
7495  if (system_active < 0) {
7496  // There was an error reading the necessary info from /proc, so use the
7497  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7498  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7499  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7500  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7501 
7502  // Make this call behave like the thread limit algorithm.
7503  retval = __kmp_avail_proc - __kmp_nth +
7504  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7505  if (retval > set_nproc) {
7506  retval = set_nproc;
7507  }
7508  if (retval < KMP_MIN_NTH) {
7509  retval = KMP_MIN_NTH;
7510  }
7511 
7512  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7513  retval));
7514  return retval;
7515  }
7516 
7517  // There is a slight delay in the load balance algorithm in detecting new
7518  // running procs. The real system load at this instant should be at least as
7519  // large as the number of active OMP threads available to add to the team.
7520  if (system_active < team_curr_active) {
7521  system_active = team_curr_active;
7522  }
7523  retval = __kmp_avail_proc - system_active + team_curr_active;
7524  if (retval > set_nproc) {
7525  retval = set_nproc;
7526  }
7527  if (retval < KMP_MIN_NTH) {
7528  retval = KMP_MIN_NTH;
7529  }
7530 
7531  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7532  return retval;
7533 } // __kmp_load_balance_nproc()
7534 
7535 #endif /* USE_LOAD_BALANCE */
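
// Core formula of __kmp_load_balance_nproc() above, stripped of the /proc
// probing and tracing (not part of the runtime; excluded from the build):
// threads this root already owns (pool + hot team + master) are added back
// to the headroom, and the result is clamped to [min_nth, set_nproc]. The
// demo_* names are invented for illustration.
#if 0
static int demo_load_balance_nproc(int avail_proc, int system_active,
                                   int team_curr_active, int set_nproc,
                                   int min_nth) {
  if (system_active < team_curr_active)
    system_active = team_curr_active; // load readings may lag our own threads
  int retval = avail_proc - system_active + team_curr_active;
  if (retval > set_nproc)
    retval = set_nproc; // never hand out more than was requested
  if (retval < min_nth)
    retval = min_nth;   // but always allow a minimal team
  return retval;
}
#endif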
7536 
7537 /* ------------------------------------------------------------------------ */
7538 
7539 /* NOTE: this is called with the __kmp_init_lock held */
7540 void __kmp_cleanup(void) {
7541  int f;
7542 
7543  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7544 
7545  if (TCR_4(__kmp_init_parallel)) {
7546 #if KMP_HANDLE_SIGNALS
7547  __kmp_remove_signals();
7548 #endif
7549  TCW_4(__kmp_init_parallel, FALSE);
7550  }
7551 
7552  if (TCR_4(__kmp_init_middle)) {
7553 #if KMP_AFFINITY_SUPPORTED
7554  __kmp_affinity_uninitialize();
7555 #endif /* KMP_AFFINITY_SUPPORTED */
7556  __kmp_cleanup_hierarchy();
7557  TCW_4(__kmp_init_middle, FALSE);
7558  }
7559 
7560  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7561 
7562  if (__kmp_init_serial) {
7563  __kmp_runtime_destroy();
7564  __kmp_init_serial = FALSE;
7565  }
7566 
7567  __kmp_cleanup_threadprivate_caches();
7568 
7569  for (f = 0; f < __kmp_threads_capacity; f++) {
7570  if (__kmp_root[f] != NULL) {
7571  __kmp_free(__kmp_root[f]);
7572  __kmp_root[f] = NULL;
7573  }
7574  }
7575  __kmp_free(__kmp_threads);
7576  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7577  // there is no need to free __kmp_root separately.
7578  __kmp_threads = NULL;
7579  __kmp_root = NULL;
7580  __kmp_threads_capacity = 0;
7581 
7582 #if KMP_USE_DYNAMIC_LOCK
7583  __kmp_cleanup_indirect_user_locks();
7584 #else
7585  __kmp_cleanup_user_locks();
7586 #endif
7587 
7588 #if KMP_AFFINITY_SUPPORTED
7589  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7590  __kmp_cpuinfo_file = NULL;
7591 #endif /* KMP_AFFINITY_SUPPORTED */
7592 
7593 #if KMP_USE_ADAPTIVE_LOCKS
7594 #if KMP_DEBUG_ADAPTIVE_LOCKS
7595  __kmp_print_speculative_stats();
7596 #endif
7597 #endif
7598  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7599  __kmp_nested_nth.nth = NULL;
7600  __kmp_nested_nth.size = 0;
7601  __kmp_nested_nth.used = 0;
7602  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7603  __kmp_nested_proc_bind.bind_types = NULL;
7604  __kmp_nested_proc_bind.size = 0;
7605  __kmp_nested_proc_bind.used = 0;
7606 #if OMP_50_ENABLED
7607  if (__kmp_affinity_format) {
7608  KMP_INTERNAL_FREE(__kmp_affinity_format);
7609  __kmp_affinity_format = NULL;
7610  }
7611 #endif
7612 
7613  __kmp_i18n_catclose();
7614 
7615 #if KMP_USE_HIER_SCHED
7616  __kmp_hier_scheds.deallocate();
7617 #endif
7618 
7619 #if KMP_STATS_ENABLED
7620  __kmp_stats_fini();
7621 #endif
7622 
7623  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7624 }
7625 
7626 /* ------------------------------------------------------------------------ */
7627 
7628 int __kmp_ignore_mppbeg(void) {
7629  char *env;
7630 
7631  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7632  if (__kmp_str_match_false(env))
7633  return FALSE;
7634  }
7635  // By default __kmpc_begin() is a no-op.
7636  return TRUE;
7637 }
7638 
7639 int __kmp_ignore_mppend(void) {
7640  char *env;
7641 
7642  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7643  if (__kmp_str_match_false(env))
7644  return FALSE;
7645  }
7646  // By default __kmpc_end() is a no-op.
7647  return TRUE;
7648 }
7649 
7650 void __kmp_internal_begin(void) {
7651  int gtid;
7652  kmp_root_t *root;
7653 
7654  /* this is a very important step as it will register new sibling threads
7655  and assign these new uber threads a new gtid */
7656  gtid = __kmp_entry_gtid();
7657  root = __kmp_threads[gtid]->th.th_root;
7658  KMP_ASSERT(KMP_UBER_GTID(gtid));
7659 
7660  if (root->r.r_begin)
7661  return;
7662  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7663  if (root->r.r_begin) {
7664  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7665  return;
7666  }
7667 
7668  root->r.r_begin = TRUE;
7669 
7670  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7671 }
7672 
7673 /* ------------------------------------------------------------------------ */
7674 
7675 void __kmp_user_set_library(enum library_type arg) {
7676  int gtid;
7677  kmp_root_t *root;
7678  kmp_info_t *thread;
7679 
7680  /* first, make sure we are initialized so we can get our gtid */
7681 
7682  gtid = __kmp_entry_gtid();
7683  thread = __kmp_threads[gtid];
7684 
7685  root = thread->th.th_root;
7686 
7687  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7688  library_serial));
7689  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7690  thread */
7691  KMP_WARNING(SetLibraryIncorrectCall);
7692  return;
7693  }
7694 
7695  switch (arg) {
7696  case library_serial:
7697  thread->th.th_set_nproc = 0;
7698  set__nproc(thread, 1);
7699  break;
7700  case library_turnaround:
7701  thread->th.th_set_nproc = 0;
7702  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7703  : __kmp_dflt_team_nth_ub);
7704  break;
7705  case library_throughput:
7706  thread->th.th_set_nproc = 0;
7707  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7708  : __kmp_dflt_team_nth_ub);
7709  break;
7710  default:
7711  KMP_FATAL(UnknownLibraryType, arg);
7712  }
7713 
7714  __kmp_aux_set_library(arg);
7715 }
7716 
7717 void __kmp_aux_set_stacksize(size_t arg) {
7718  if (!__kmp_init_serial)
7719  __kmp_serial_initialize();
7720 
7721 #if KMP_OS_DARWIN
7722  if (arg & (0x1000 - 1)) {
7723  arg &= ~(0x1000 - 1);
7724  if (arg + 0x1000) /* check for overflow if we round up */
7725  arg += 0x1000;
7726  }
7727 #endif
7728  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7729 
7730  /* only change the default stacksize before the first parallel region */
7731  if (!TCR_4(__kmp_init_parallel)) {
7732  size_t value = arg; /* argument is in bytes */
7733 
7734  if (value < __kmp_sys_min_stksize)
7735  value = __kmp_sys_min_stksize;
7736  else if (value > KMP_MAX_STKSIZE)
7737  value = KMP_MAX_STKSIZE;
7738 
7739  __kmp_stksize = value;
7740 
7741  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7742  }
7743 
7744  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7745 }
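
// The KMP_OS_DARWIN branch above rounds a requested stack size up to the next
// 4 KiB page boundary. Equivalent stand-alone arithmetic (not part of the
// runtime; excluded from the build; the overflow check is simplified):
#if 0
#include <cstddef>

static size_t demo_round_up_to_page(size_t bytes, size_t page = 0x1000) {
  if (bytes & (page - 1))                 // not already page-aligned?
    bytes = (bytes & ~(page - 1)) + page; // drop the remainder, add one page
  return bytes;
}
#endif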
7746 
7747 /* set the behaviour of the runtime library */
7748 /* TODO this can cause some odd behaviour with sibling parallelism... */
7749 void __kmp_aux_set_library(enum library_type arg) {
7750  __kmp_library = arg;
7751 
7752  switch (__kmp_library) {
7753  case library_serial: {
7754  KMP_INFORM(LibraryIsSerial);
7755  } break;
7756  case library_turnaround:
7757  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7758  __kmp_use_yield = 2; // only yield when oversubscribed
7759  break;
7760  case library_throughput:
7761  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7762  __kmp_dflt_blocktime = 200;
7763  break;
7764  default:
7765  KMP_FATAL(UnknownLibraryType, arg);
7766  }
7767 }
7768 
7769 /* Getting team information common for all team API */
7770 // Returns NULL if not in teams construct
7771 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7772  kmp_info_t *thr = __kmp_entry_thread();
7773  teams_serialized = 0;
7774  if (thr->th.th_teams_microtask) {
7775  kmp_team_t *team = thr->th.th_team;
7776  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7777  int ii = team->t.t_level;
7778  teams_serialized = team->t.t_serialized;
7779  int level = tlevel + 1;
7780  KMP_DEBUG_ASSERT(ii >= tlevel);
7781  while (ii > level) {
7782  for (teams_serialized = team->t.t_serialized;
7783  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7784  }
7785  if (team->t.t_serialized && (!teams_serialized)) {
7786  team = team->t.t_parent;
7787  continue;
7788  }
7789  if (ii > level) {
7790  team = team->t.t_parent;
7791  ii--;
7792  }
7793  }
7794  return team;
7795  }
7796  return NULL;
7797 }
7798 
7799 int __kmp_aux_get_team_num() {
7800  int serialized;
7801  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7802  if (team) {
7803  if (serialized > 1) {
7804  return 0; // teams region is serialized ( 1 team of 1 thread ).
7805  } else {
7806  return team->t.t_master_tid;
7807  }
7808  }
7809  return 0;
7810 }
7811 
7812 int __kmp_aux_get_num_teams() {
7813  int serialized;
7814  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7815  if (team) {
7816  if (serialized > 1) {
7817  return 1;
7818  } else {
7819  return team->t.t_parent->t.t_nproc;
7820  }
7821  }
7822  return 1;
7823 }
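// Illustrative example: these two helpers back omp_get_team_num() and
// omp_get_num_teams(). Assuming a non-serialized teams region (and <omp.h>,
// <stdio.h> included):
//
//   #pragma omp teams num_teams(4)
//   printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//
// each initial thread reports a team number in [0, 3] and a team count of 4;
// if the teams region is serialized, the pair degenerates to (0, 1), as the
// "serialized > 1" branches above show.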
7824 
7825 /* ------------------------------------------------------------------------ */
7826 
7827 #if OMP_50_ENABLED
7828 /*
7829  * Affinity Format Parser
7830  *
7831  * A field has the form: %[[[0].]size]type
7832  * % and type are required (%% means print a literal '%')
7833  * type is either single char or long name surrounded by {},
7834  * e.g., N or {num_threads}
7835  * 0 => leading zeros
7836  * . => right justified when size is specified
7837  * by default output is left justified
7838  * size is the *minimum* field length
7839  * All other characters are printed as is
7840  *
7841  * Available field types:
7842  * t {team_num} - omp_get_team_num()
7843  * T {num_teams} - omp_get_num_teams()
7844  * L {nesting_level} - omp_get_level()
7845  * n {thread_num} - omp_get_thread_num()
7846  * N {num_threads} - omp_get_num_threads()
7847  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7848  * H {host} - name of host machine
7849  * P {process_id} - process id, i {native_thread_id} - native thread id
7850  * A {thread_affinity} - comma separated integers or ranges (affinity mask)
7851  *
7852  * Implementation-specific field types can be added
7853  * If a type is unknown, print "undefined"
7854 */
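// Worked example (illustrative values only): given the format string
//   "OMP: host=%H pid=%P thread_num=%0.4n"
// the parser emits the literal text plus three fields, so one thread might
// produce
//   "OMP: host=node01 pid=42193 thread_num=0007"
// where %0.4n requests a zero-padded, right-justified field of minimum width 4
// holding omp_get_thread_num(); the host name and pid shown are made up.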
7855 
7856 // Structure holding the short name, long name, and corresponding data type
7857 // for snprintf. A table of these will represent the entire valid keyword
7858 // field types.
7859 typedef struct kmp_affinity_format_field_t {
7860  char short_name; // from spec e.g., L -> thread level
7861  const char *long_name; // from spec thread_level -> thread level
7862  char field_format; // data type for snprintf (typically 'd' or 's'
7863  // for integer or string)
7864 } kmp_affinity_format_field_t;
7865 
7866 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7867 #if KMP_AFFINITY_SUPPORTED
7868  {'A', "thread_affinity", 's'},
7869 #endif
7870  {'t', "team_num", 'd'},
7871  {'T', "num_teams", 'd'},
7872  {'L', "nesting_level", 'd'},
7873  {'n', "thread_num", 'd'},
7874  {'N', "num_threads", 'd'},
7875  {'a', "ancestor_tnum", 'd'},
7876  {'H', "host", 's'},
7877  {'P', "process_id", 'd'},
7878  {'i', "native_thread_id", 'd'}};
7879 
7880 // Return the number of characters it takes to hold field
7881 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7882  const char **ptr,
7883  kmp_str_buf_t *field_buffer) {
7884  int rc, format_index, field_value;
7885  const char *width_left, *width_right;
7886  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7887  static const int FORMAT_SIZE = 20;
7888  char format[FORMAT_SIZE] = {0};
7889  char absolute_short_name = 0;
7890 
7891  KMP_DEBUG_ASSERT(gtid >= 0);
7892  KMP_DEBUG_ASSERT(th);
7893  KMP_DEBUG_ASSERT(**ptr == '%');
7894  KMP_DEBUG_ASSERT(field_buffer);
7895 
7896  __kmp_str_buf_clear(field_buffer);
7897 
7898  // Skip the initial %
7899  (*ptr)++;
7900 
7901  // Check for %% first
7902  if (**ptr == '%') {
7903  __kmp_str_buf_cat(field_buffer, "%", 1);
7904  (*ptr)++; // skip over the second %
7905  return 1;
7906  }
7907 
7908  // Parse field modifiers if they are present
7909  pad_zeros = false;
7910  if (**ptr == '0') {
7911  pad_zeros = true;
7912  (*ptr)++; // skip over 0
7913  }
7914  right_justify = false;
7915  if (**ptr == '.') {
7916  right_justify = true;
7917  (*ptr)++; // skip over .
7918  }
7919  // Parse width of field: [width_left, width_right)
7920  width_left = width_right = NULL;
7921  if (**ptr >= '0' && **ptr <= '9') {
7922  width_left = *ptr;
7923  SKIP_DIGITS(*ptr);
7924  width_right = *ptr;
7925  }
7926 
7927  // Create the format for KMP_SNPRINTF based on flags parsed above
7928  format_index = 0;
7929  format[format_index++] = '%';
7930  if (!right_justify)
7931  format[format_index++] = '-';
7932  if (pad_zeros)
7933  format[format_index++] = '0';
7934  if (width_left && width_right) {
7935  int i = 0;
7936  // Only allow 8 digit number widths.
7937  // This also prevents overflowing format variable
7938  while (i < 8 && width_left < width_right) {
7939  format[format_index++] = *width_left;
7940  width_left++;
7941  i++;
7942  }
7943  }
7944 
7945  // Parse a name (long or short)
7946  // Canonicalize the name into absolute_short_name
7947  found_valid_name = false;
7948  parse_long_name = (**ptr == '{');
7949  if (parse_long_name)
7950  (*ptr)++; // skip initial left brace
7951  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7952  sizeof(__kmp_affinity_format_table[0]);
7953  ++i) {
7954  char short_name = __kmp_affinity_format_table[i].short_name;
7955  const char *long_name = __kmp_affinity_format_table[i].long_name;
7956  char field_format = __kmp_affinity_format_table[i].field_format;
7957  if (parse_long_name) {
7958  int length = KMP_STRLEN(long_name);
7959  if (strncmp(*ptr, long_name, length) == 0) {
7960  found_valid_name = true;
7961  (*ptr) += length; // skip the long name
7962  }
7963  } else if (**ptr == short_name) {
7964  found_valid_name = true;
7965  (*ptr)++; // skip the short name
7966  }
7967  if (found_valid_name) {
7968  format[format_index++] = field_format;
7969  format[format_index++] = '\0';
7970  absolute_short_name = short_name;
7971  break;
7972  }
7973  }
7974  if (parse_long_name) {
7975  if (**ptr != '}') {
7976  absolute_short_name = 0;
7977  } else {
7978  (*ptr)++; // skip over the right brace
7979  }
7980  }
7981 
7982  // Attempt to fill the buffer with the requested
7983  // value using snprintf within __kmp_str_buf_print()
7984  switch (absolute_short_name) {
7985  case 't':
7986  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7987  break;
7988  case 'T':
7989  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7990  break;
7991  case 'L':
7992  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7993  break;
7994  case 'n':
7995  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7996  break;
7997  case 'H': {
7998  static const int BUFFER_SIZE = 256;
7999  char buf[BUFFER_SIZE];
8000  __kmp_expand_host_name(buf, BUFFER_SIZE);
8001  rc = __kmp_str_buf_print(field_buffer, format, buf);
8002  } break;
8003  case 'P':
8004  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8005  break;
8006  case 'i':
8007  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8008  break;
8009  case 'N':
8010  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8011  break;
8012  case 'a':
8013  field_value =
8014  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8015  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8016  break;
8017 #if KMP_AFFINITY_SUPPORTED
8018  case 'A': {
8019  kmp_str_buf_t buf;
8020  __kmp_str_buf_init(&buf);
8021  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8022  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8023  __kmp_str_buf_free(&buf);
8024  } break;
8025 #endif
8026  default:
8027  // According to the spec, if an implementation does not have info for a
8028  // field type, then "undefined" is printed
8029  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8030  // Skip the field
8031  if (parse_long_name) {
8032  SKIP_TOKEN(*ptr);
8033  if (**ptr == '}')
8034  (*ptr)++;
8035  } else {
8036  (*ptr)++;
8037  }
8038  }
8039 
8040  KMP_ASSERT(format_index <= FORMAT_SIZE);
8041  return rc;
8042 }
8043 
8044 /*
8045  * Return the number of characters needed to hold the affinity string
8046  * (not including the terminating null byte).
8047  * The resulting string is written to buffer, which the caller can then
8048  * process afterwards.
8049 */
8050 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8051  kmp_str_buf_t *buffer) {
8052  const char *parse_ptr;
8053  size_t retval;
8054  const kmp_info_t *th;
8055  kmp_str_buf_t field;
8056 
8057  KMP_DEBUG_ASSERT(buffer);
8058  KMP_DEBUG_ASSERT(gtid >= 0);
8059 
8060  __kmp_str_buf_init(&field);
8061  __kmp_str_buf_clear(buffer);
8062 
8063  th = __kmp_threads[gtid];
8064  retval = 0;
8065 
8066  // If format is NULL or zero-length string, then we use
8067  // affinity-format-var ICV
8068  parse_ptr = format;
8069  if (parse_ptr == NULL || *parse_ptr == '\0') {
8070  parse_ptr = __kmp_affinity_format;
8071  }
8072  KMP_DEBUG_ASSERT(parse_ptr);
8073 
8074  while (*parse_ptr != '\0') {
8075  // Parse a field
8076  if (*parse_ptr == '%') {
8077  // Put field in the buffer
8078  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8079  __kmp_str_buf_catbuf(buffer, &field);
8080  retval += rc;
8081  } else {
8082  // Put literal character in buffer
8083  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8084  retval++;
8085  parse_ptr++;
8086  }
8087  }
8088  __kmp_str_buf_free(&field);
8089  return retval;
8090 }
8091 
8092 // Displays the affinity string to stdout
8093 void __kmp_aux_display_affinity(int gtid, const char *format) {
8094  kmp_str_buf_t buf;
8095  __kmp_str_buf_init(&buf);
8096  __kmp_aux_capture_affinity(gtid, format, &buf);
8097  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8098  __kmp_str_buf_free(&buf);
8099 }
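// Usage sketch (assumptions noted): these routines sit behind the OpenMP 5.0
// affinity-display interface and are normally reached via
// OMP_DISPLAY_AFFINITY=TRUE or omp_display_affinity() rather than called
// directly. A minimal user-side example, assuming an OpenMP 5.0 <omp.h>:
//
//   #include <omp.h>
//   #pragma omp parallel
//   omp_display_affinity("host=%H, thread=%n, affinity={%A}");
//
// Passing NULL or an empty format falls back to the affinity-format-var ICV
// (settable via OMP_AFFINITY_FORMAT), as __kmp_aux_capture_affinity shows.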
8100 #endif // OMP_50_ENABLED
8101 
8102 /* ------------------------------------------------------------------------ */
8103 
8104 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8105  int blocktime = arg; /* argument is in milliseconds */
8106 #if KMP_USE_MONITOR
8107  int bt_intervals;
8108 #endif
8109  int bt_set;
8110 
8111  __kmp_save_internal_controls(thread);
8112 
8113  /* Normalize and set blocktime for the teams */
8114  if (blocktime < KMP_MIN_BLOCKTIME)
8115  blocktime = KMP_MIN_BLOCKTIME;
8116  else if (blocktime > KMP_MAX_BLOCKTIME)
8117  blocktime = KMP_MAX_BLOCKTIME;
8118 
8119  set__blocktime_team(thread->th.th_team, tid, blocktime);
8120  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8121 
8122 #if KMP_USE_MONITOR
8123  /* Calculate and set blocktime intervals for the teams */
8124  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8125 
8126  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8127  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8128 #endif
8129 
8130  /* Record that blocktime has been explicitly set */
8131  bt_set = TRUE;
8132 
8133  set__bt_set_team(thread->th.th_team, tid, bt_set);
8134  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8135 #if KMP_USE_MONITOR
8136  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8137  "bt_intervals=%d, monitor_updates=%d\n",
8138  __kmp_gtid_from_tid(tid, thread->th.th_team),
8139  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8140  __kmp_monitor_wakeups));
8141 #else
8142  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8143  __kmp_gtid_from_tid(tid, thread->th.th_team),
8144  thread->th.th_team->t.t_id, tid, blocktime));
8145 #endif
8146 }
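// Illustrative sketch: blocktime is normally configured through KMP_BLOCKTIME
// or the kmp_set_blocktime() KMP extension, both in milliseconds, e.g.
//
//   kmp_set_blocktime(0);   // idle workers sleep immediately after a region
//   kmp_set_blocktime(200); // spin ~200 ms before sleeping (typical default)
//
// The value is clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] above and is
// applied to both the calling thread's current team and its serial team.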
8147 
8148 void __kmp_aux_set_defaults(char const *str, int len) {
8149  if (!__kmp_init_serial) {
8150  __kmp_serial_initialize();
8151  }
8152  __kmp_env_initialize(str);
8153 
8154  if (__kmp_settings
8155 #if OMP_40_ENABLED
8156  || __kmp_display_env || __kmp_display_env_verbose
8157 #endif // OMP_40_ENABLED
8158  ) {
8159  __kmp_env_print();
8160  }
8161 } // __kmp_aux_set_defaults
8162 
8163 /* ------------------------------------------------------------------------ */
8164 /* internal fast reduction routines */
8165 
8166 PACKED_REDUCTION_METHOD_T
8167 __kmp_determine_reduction_method(
8168  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8169  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8170  kmp_critical_name *lck) {
8171 
8172  // Default reduction method: critical construct ( lck != NULL, as in the
8173  // current PAROPT ).
8174  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
8175  // method can be selected by the RTL.
8176  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8177  // can be selected by the RTL.
8178  // Finally, it is up to the OpenMP RTL to decide which method to use
8179  // among those generated by PAROPT.
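  // Illustrative scenario (values assumed, not from the source): on an x86_64
  // Linux host that is not a MIC, teamsize_cutoff below is 4, so a
  // tree-capable reduction in a team of 16 threads resolves to
  // TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, while the same reduction in a
  // team of 2 threads with a compiler-generated atomic variant resolves to
  // atomic_reduce_block.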
8180 
8181  PACKED_REDUCTION_METHOD_T retval;
8182 
8183  int team_size;
8184 
8185  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8186  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8187 
8188 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8189  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8190 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8191 
8192  retval = critical_reduce_block;
8193 
8194  // another way of getting the team size (with 1 dynamic dereference) is slower
8195  team_size = __kmp_get_team_num_threads(global_tid);
8196  if (team_size == 1) {
8197 
8198  retval = empty_reduce_block;
8199 
8200  } else {
8201 
8202  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8203 
8204 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8205 
8206 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8207  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8208 
8209  int teamsize_cutoff = 4;
8210 
8211 #if KMP_MIC_SUPPORTED
8212  if (__kmp_mic_type != non_mic) {
8213  teamsize_cutoff = 8;
8214  }
8215 #endif
8216  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8217  if (tree_available) {
8218  if (team_size <= teamsize_cutoff) {
8219  if (atomic_available) {
8220  retval = atomic_reduce_block;
8221  }
8222  } else {
8223  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8224  }
8225  } else if (atomic_available) {
8226  retval = atomic_reduce_block;
8227  }
8228 #else
8229 #error "Unknown or unsupported OS"
8230 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8231  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8232 
8233 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8234 
8235 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_KFREEBSD
8236 
8237  // basic tuning
8238 
8239  if (atomic_available) {
8240  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8241  retval = atomic_reduce_block;
8242  }
8243  } // otherwise: use critical section
8244 
8245 #elif KMP_OS_DARWIN
8246 
8247  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8248  if (atomic_available && (num_vars <= 3)) {
8249  retval = atomic_reduce_block;
8250  } else if (tree_available) {
8251  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8252  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8253  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8254  }
8255  } // otherwise: use critical section
8256 
8257 #else
8258 #error "Unknown or unsupported OS"
8259 #endif
8260 
8261 #else
8262 #error "Unknown or unsupported architecture"
8263 #endif
8264  }
8265 
8266  // KMP_FORCE_REDUCTION
8267 
8268  // If the team is serialized (team_size == 1), ignore the forced reduction
8269  // method and stay with the unsynchronized method (empty_reduce_block)
8270  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8271  team_size != 1) {
8272 
8273  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8274 
8275  int atomic_available, tree_available;
8276 
8277  switch ((forced_retval = __kmp_force_reduction_method)) {
8278  case critical_reduce_block:
8279  KMP_ASSERT(lck); // lck should be != 0
8280  break;
8281 
8282  case atomic_reduce_block:
8283  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8284  if (!atomic_available) {
8285  KMP_WARNING(RedMethodNotSupported, "atomic");
8286  forced_retval = critical_reduce_block;
8287  }
8288  break;
8289 
8290  case tree_reduce_block:
8291  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8292  if (!tree_available) {
8293  KMP_WARNING(RedMethodNotSupported, "tree");
8294  forced_retval = critical_reduce_block;
8295  } else {
8296 #if KMP_FAST_REDUCTION_BARRIER
8297  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8298 #endif
8299  }
8300  break;
8301 
8302  default:
8303  KMP_ASSERT(0); // "unsupported method specified"
8304  }
8305 
8306  retval = forced_retval;
8307  }
8308 
8309  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8310 
8311 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8312 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8313 
8314  return (retval);
8315 }
8316 
8317 // this function is for testing set/get/determine reduce method
8318 kmp_int32 __kmp_get_reduce_method(void) {
8319  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8320 }
8321 
8322 #if OMP_50_ENABLED
8323 
8324 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8325 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8326 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8327 
8328 // Hard pause shuts down the runtime completely. Resume happens naturally when
8329 // OpenMP is used subsequently.
8330 void __kmp_hard_pause() {
8331  __kmp_pause_status = kmp_hard_paused;
8332  __kmp_internal_end_thread(-1);
8333 }
8334 
8335 // Soft resume resets __kmp_pause_status to kmp_not_paused and wakes up all threads.
8336 void __kmp_resume_if_soft_paused() {
8337  if (__kmp_pause_status == kmp_soft_paused) {
8338  __kmp_pause_status = kmp_not_paused;
8339 
8340  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8341  kmp_info_t *thread = __kmp_threads[gtid];
8342  if (thread) { // Wake it if sleeping
8343  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8344  if (fl.is_sleeping())
8345  fl.resume(gtid);
8346  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8347  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8348  } else { // thread holds the lock and may sleep soon
8349  do { // until either the thread sleeps, or we can get the lock
8350  if (fl.is_sleeping()) {
8351  fl.resume(gtid);
8352  break;
8353  } else if (__kmp_try_suspend_mx(thread)) {
8354  __kmp_unlock_suspend_mx(thread);
8355  break;
8356  }
8357  } while (1);
8358  }
8359  }
8360  }
8361  }
8362 }
8363 
8364 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8365 // TODO: add warning messages
8366 int __kmp_pause_resource(kmp_pause_status_t level) {
8367  if (level == kmp_not_paused) { // requesting resume
8368  if (__kmp_pause_status == kmp_not_paused) {
8369  // error message about runtime not being paused, so can't resume
8370  return 1;
8371  } else {
8372  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8373  __kmp_pause_status == kmp_hard_paused);
8374  __kmp_pause_status = kmp_not_paused;
8375  return 0;
8376  }
8377  } else if (level == kmp_soft_paused) { // requesting soft pause
8378  if (__kmp_pause_status != kmp_not_paused) {
8379  // error message about already being paused
8380  return 1;
8381  } else {
8382  __kmp_soft_pause();
8383  return 0;
8384  }
8385  } else if (level == kmp_hard_paused) { // requesting hard pause
8386  if (__kmp_pause_status != kmp_not_paused) {
8387  // error message about already being paused
8388  return 1;
8389  } else {
8390  __kmp_hard_pause();
8391  return 0;
8392  }
8393  } else {
8394  // error message about invalid level
8395  return 1;
8396  }
8397 }
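// Usage sketch (hedged): __kmp_pause_resource backs the OpenMP 5.0 routines
// omp_pause_resource() and omp_pause_resource_all(). A caller releasing
// runtime resources between phases might write, assuming a 5.0 <omp.h>:
//
//   if (omp_pause_resource_all(omp_pause_soft) != 0) {
//     // non-zero means the request was rejected (e.g., already paused),
//     // mirroring the return convention of __kmp_pause_resource above
//   }
//
// A hard pause (omp_pause_hard) additionally shuts the runtime down via
// __kmp_hard_pause(); the runtime re-initializes on the next OpenMP construct.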
8398 
8399 #endif // OMP_50_ENABLED