#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#include "ompt-specific.h"

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Initialize a dispatch_private_info_template<T> buffer for a particular
// schedule and chunk size.
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
                                   kmp_uint64 *cur_chunk,
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init_algorithm: T#%%d called "
        "pr:%%p lb:%%%s ub:%%%s st:%%%s "
        "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
        traits_t<ST>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // there is only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the schedule and chunk size specified by OMP_SCHEDULE (or the
      // defaults if not specified).
      schedule = team->t.t_sched.r_sched_type;
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      chunk = team->t.t_sched.chunk;
#ifdef KMP_DEBUG
      {
        char *buff;
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
#if OMP_45_ENABLED
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#ifdef KMP_DEBUG
      {
        char *buff;
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#endif // OMP_45_ENABLED
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");
  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count (cast to unsigned so full-range loops divide correctly)
  if (st < 0) {
    tc = (UT)(lb - ub) / (-st) + 1;
  } else {
    tc = (UT)(ub - lb) / st + 1;
  }
#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */
  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
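  // kmp_sch_static_steal: the trip count is split into ntc chunks of size
  // `chunk`, and each of the nproc threads is seeded with a contiguous slice
  // of those chunks (count..ub below).  parm4 records the neighbour thread to
  // try first once this thread runs out of its own chunks and starts stealing.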
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      pr->u.p.parm4 = (id + 1) % nproc; // remember the neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // use a dynamically allocated per-thread lock for wide types;
        // it is freed in __kmp_dispatch_next when status == 0
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: fall through to kmp_sch_static_balanced */
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
    }
  } // case
    // FALL-THROUGH to static_balanced
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;

    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate matches
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
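  // kmp_sch_static_balanced_chunked: like static_balanced, but the per-thread
  // share is rounded up to a multiple of `chunk` (the simd width), which the
  // mask below assumes is a power of two.  For example, with tc = 100,
  // nth = 8 and chunk = 8: (100 + 7) / 8 = 13, then (13 + 7) & ~7 = 16
  // iterations per thread.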
  case kmp_sch_static_balanced_chunked: {
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
#if OMP_45_ENABLED
  case kmp_sch_guided_simd:
#endif // OMP_45_ENABLED
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
    }
    break;
  } // case
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        DBL x;
        UT cross;

#if KMP_USE_X87CONTROL
        // save the original FPCW and set precision to 64-bit, as Windows* OS
        // on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which
           C_i <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          }
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
    }
    break;
  } // case
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
    break;
  } // case
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif // KMP_USE_HIER_SCHED
// UT - unsigned flavor of T, ST - signed flavor of T
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    char *buff;
    typedef typename traits_t<T>::signed_t ST;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Hierarchical scheduling does not work with ordered loops, so if ordered
  // is detected, revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical scheduling for ordered parallel loops
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
                                &cur_chunk, chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (pr->flags.ordered == 0) {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  } else {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
  }

  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata (only by master of active team at level 1)
    if (itt_need_metadata_reporting) {
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
#if OMP_45_ENABLED
      case kmp_sch_guided_simd:
#endif
        schedtype = 2;
        break;
      default:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
  } // if (active)

#ifdef KMP_DEBUG
  {
    char *buff;
    typedef typename traits_t<T>::signed_t ST;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this counter when searching for a victim;
    // bumping it marks this thread as available for stealing.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte and CAS for 4-byte induction variables
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded: reduce victim's ub by 1/4 of undone chunks
          // or by 1
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range excluding init chunk
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable: pack (count, ub) into 64 bits and use CAS
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;

        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (CAS succeeded)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
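  // kmp_sch_dynamic_chunked: chunks are handed out in order of request; each
  // call atomically bumps the shared iteration counter and the caller takes
  // the chunk whose index was returned.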
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
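  // kmp_sch_guided_iterative_chunked: the next chunk is a fixed fraction of
  // the remaining iterations (parm3, set from guided_flt_param/nproc at init),
  // claimed with a compare-and-swap on the shared iteration counter; once the
  // remainder drops below parm2 the schedule degenerates to plain dynamic
  // chunks of size chunkspec.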
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of computations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // same as iterative but the current chunk is adjusted to be a multiple of
    // the given chunk (simd width)
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of computations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span % chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
#endif // OMP_45_ENABLED

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing the original FPCW value (Windows* OS on IA-32) */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
/* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
        /* if not already done, save the original FPCW and set precision to
           64-bit, as Windows* OS on IA-32 defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary; check fpcwSet first because oldFpcw may be
       uninitialized here */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // Note: schedule(runtime) appears here even if the actual runtime schedule
  // is static.
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */
      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
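// __kmp_dist_get_bounds: for distribute loops, narrow [*plower, *pupper] to
// the subrange owned by this team.  The global trip count is divided across
// nteams either evenly (static_balanced, with the first `extras` teams taking
// one extra iteration) or in greedy ceil(trip_count/nteams) blocks, and
// *plastiter is set for the team that owns the final iteration.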
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal; the compiler does not catch all such cases.
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper;
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper;
      }
    }
  }
}
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                            );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                             );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                            );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                             );
}
// Mark the end of a dynamic loop.
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
             ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
                      ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */