diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py index 799e8ad228..3ea427a54f 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/auto_optimize.py @@ -434,6 +434,8 @@ def _gt_auto_process_top_level_maps( The function assumes that `gt_simplify()` has been called on the SDFG before it is passed to this function. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() # NOTE: Inside this function we have to disable the consolidation of edges. # This is because it might block the application of `SpliAccessNode`. As @@ -690,6 +692,8 @@ def _gt_auto_process_dataflow_inside_maps( over a constant range, e.g. the number of neighbours, which is known at compile time, so the compiler will fully unroll them anyway. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() # Separate Tasklets into dependent and independent parts to promote data # reusability. 
It is important that this step has to be performed before diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/concat_where_mapper.py b/src/gt4py/next/program_processors/runners/dace/transformations/concat_where_mapper.py index 8052426f33..4580a19823 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/concat_where_mapper.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/concat_where_mapper.py @@ -1078,7 +1078,7 @@ def write_tasklet_code(tlet_inputs, select_conds, prod_accesses): # type: ignor ) concat_where_tasklet = state.add_tasklet( tasklet_name, - inputs=set(tlet_inputs), + inputs={k: None for k in tlet_inputs}, outputs={tlet_output}, code=tlet_code, ) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/dead_dataflow_elimination.py b/src/gt4py/next/program_processors/runners/dace/transformations/dead_dataflow_elimination.py index 39dd321644..912863cf20 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/dead_dataflow_elimination.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/dead_dataflow_elimination.py @@ -39,6 +39,9 @@ def gt_eliminate_dead_dataflow( Todo: Implement a better way of applying the `DeadMemletElimination` transformation. """ + # Sort SDFG for deterministic pattern matching. 
+ sdfg.sort_sdfg_alphabetically() + find_single_use_data = dace_analysis.FindSingleUseData() single_use_data = find_single_use_data.apply_pass(sdfg, None) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/fuse_horizontal_conditionblocks.py b/src/gt4py/next/program_processors/runners/dace/transformations/fuse_horizontal_conditionblocks.py index ef0e64d05b..701592d324 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/fuse_horizontal_conditionblocks.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/fuse_horizontal_conditionblocks.py @@ -290,7 +290,7 @@ def apply( # to the first conditional block SDFG. We don't have to add `__cond` because we know it's the same for both conditional blocks. # TODO(iomaganaris): Remove inputs to the conditional block that come from the same AccessNodes (same data) second_arrays_rename_map: dict[str, str] = {} - for data_name, data_desc in fused_conditional_block.sdfg.arrays.items(): + for data_name, data_desc in sorted(fused_conditional_block.sdfg.arrays.items()): if data_name == "__cond": continue new_data_name = gtx_transformations.utils.unique_name(data_name) + "_from_cb_fusion" @@ -303,7 +303,7 @@ def apply( # Move the connectors from the second conditional block to the first # TODO(iomaganaris): Here we copy empty memlets used for scheduling as well. This means that the first conditional blocks inherits the scheduling of the second one as well. 
Maybe that's not good in some cases to hide latency but for now we keep it as it is - for edge in graph.in_edges(nested_sdfg_of_fused_conditional_block): + for edge in sorted(graph.in_edges(nested_sdfg_of_fused_conditional_block), key=lambda e: str(e.dst_conn)): if edge.dst_conn == "__cond": continue nested_sdfg_of_extended_conditional_block.add_in_connector( @@ -315,7 +315,7 @@ def apply( new_dst_conn=second_arrays_rename_map[edge.dst_conn], new_dst=nested_sdfg_of_extended_conditional_block, ) - for edge in graph.out_edges(nested_sdfg_of_fused_conditional_block): + for edge in sorted(graph.out_edges(nested_sdfg_of_fused_conditional_block), key=lambda e: str(e.src_conn)): nested_sdfg_of_extended_conditional_block.add_out_connector( second_arrays_rename_map[edge.src_conn] ) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py b/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py index aa34736c8a..0ff4d8227e 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/gpu_utils.py @@ -199,6 +199,8 @@ def restrict_fusion_to_newly_created_maps_horizontal( # Now try to fuse the maps together, but restrict them that at least one map # needs to be new. # TODO(phimuell): Improve this by replacing it by an explicit loop. + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() sdfg.apply_transformations_repeated( [ gtx_transformations.MapFusionVertical( @@ -791,6 +793,8 @@ def gt_remove_trivial_gpu_maps( Todo: Improve this function. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() # First we try to promote and fuse them with other non-trivial maps. sdfg.apply_transformations_once_everywhere( @@ -828,6 +832,8 @@ def restrict_to_trivial_gpu_maps( return True # TODO(phimuell): Replace this with a more performant loop. 
+ # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() sdfg.apply_transformations_repeated( [ gtx_transformations.MapFusionVertical( diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_extended.py b/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_extended.py index ed09312d4f..520ed74ad3 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_extended.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_extended.py @@ -124,6 +124,8 @@ def gt_horizontal_map_split_fusion( validate: Perform validation during the steps. validate_all: Perform extensive validation. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() transformations = [ HorizontalSplitMapRange( @@ -216,6 +218,9 @@ def gt_vertical_map_split_fusion( - Due to a bug in the transformation, not all Maps, that were created by the splitting were fused. Especially "chains" might still be present. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() + if single_use_data is None: find_single_use_data = dace_analysis.FindSingleUseData() single_use_data = find_single_use_data.apply_pass(sdfg, None) @@ -795,4 +800,6 @@ def _restrict_fusion_to_newly_created_maps( trafo._single_use_data = self._single_use_data # This is not efficient, but it is currently the only way to run it + # Sort SDFG for deterministic pattern matching. 
+ sdfg.sort_sdfg_alphabetically() sdfg.apply_transformations_repeated(trafo, validate=False, validate_all=False) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_utils.py b/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_utils.py index 2be9bbaede..e9931dce9b 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_utils.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/map_fusion_utils.py @@ -52,7 +52,8 @@ def _new_name(old_name: str) -> str: new_data_descriptors = {} subgraph = graph.scope_subgraph(map_entry, include_entry=True, include_exit=True) - map_nodes = subgraph.nodes() + map_nodes = sorted(subgraph.nodes(), + key=lambda n: (type(n).__name__, str(getattr(n, 'data', getattr(n, 'label', ''))))) map_edges = subgraph.edges() new_map_entry = None @@ -78,8 +79,8 @@ def _new_name(old_name: str) -> str: elif isinstance(node, dace_nodes.NestedSDFG): node_ = graph.add_nested_sdfg( sdfg=copy.deepcopy(node.sdfg), - inputs=set(node.in_connectors.keys()), - outputs=set(node.out_connectors.keys()), + inputs=dict(node.in_connectors), + outputs=dict(node.out_connectors), symbol_mapping=node.symbol_mapping.copy(), debuginfo=copy.copy(node.debuginfo), ) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/map_orderer.py b/src/gt4py/next/program_processors/runners/dace/transformations/map_orderer.py index 2f56671da8..50fecc0354 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/map_orderer.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/map_orderer.py @@ -43,6 +43,9 @@ def gt_set_iteration_order( validate: Perform validation at the end of the function. validate_all: Perform validation also on intermediate steps. """ + # Sort SDFG for deterministic pattern matching. 
+ sdfg.sort_sdfg_alphabetically() + return sdfg.apply_transformations_once_everywhere( MapIterationOrder( unit_strides_dims=unit_strides_dim, diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py b/src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py index 3b0f47c02c..90017cfded 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py @@ -28,6 +28,17 @@ from gt4py.next.program_processors.runners.dace import transformations as gtx_transformations +def _node_sort_key(node: dace_nodes.Node) -> str: + """Return a deterministic string key for sorting DaCe nodes. + + Used to impose a stable iteration order on sets/collections of nodes, + preventing non-deterministic code generation caused by arbitrary set + iteration order. + """ + label = getattr(node, "data", getattr(node, "label", "")) + return f"{type(node).__name__}_{label}" + + @dace_properties.make_properties class MoveDataflowIntoIfBody(dace_transformation.SingleStateTransformation): """The transformation moves dataflow into the if branches. @@ -320,7 +331,7 @@ def _replicate_dataflow_into_branch( # Add the SDFGState to the key of the dictionary because we have to create # new node for the different branches. unique_old_nodes: list[dace_nodes.Node] = [] - for old_node in nodes_to_move: + for old_node in sorted(nodes_to_move, key=_node_sort_key): if (old_node, branch_state) in old_to_new_nodes_map: continue unique_old_nodes.append(old_node) @@ -838,7 +849,7 @@ def filter_nodes( def _partition_if_block( self, if_block: dace_nodes.NestedSDFG, - ) -> Optional[tuple[set[str], set[str]]]: + ) -> Optional[tuple[list[str], list[str]]]: """Check if `if_block` can be processed and partition the input connectors. The function will check if `if_block` has the right structure, i.e. 
if it is @@ -849,10 +860,10 @@ def _partition_if_block( Returns: If `if_block` is unsuitable the function will return `None`. If `if_block` meets the structural requirements the function will return - two sets of strings. The first set contains the connectors that can be - relocated and the second one of the conditions that can not be relocated. + two sorted lists of strings. The first list contains the connectors that + can be relocated and the second one the connectors that can not be relocated. + Sorting ensures deterministic downstream iteration order. """ - # TODO(phimuell): Change the return type to `tuple[list[str], list[str]]` and sort the connectors, such that the operation is deterministic. # There shall only be one output and three inputs with given names. if len(if_block.out_connectors.keys()) == 0: return None @@ -899,14 +910,14 @@ def _partition_if_block( # So the ones that can be relocated were found exactly once. Zero would # mean they can not be relocated and more than one means that we do not # support it yet. 
- relocatable_connectors = { + relocatable_connectors = sorted( conn_name for conn_name, conn_count in reference_count.items() if conn_count == 1 - } - non_relocatable_connectors = { + ) + non_relocatable_connectors = sorted( conn_name for conn_name in reference_count.keys() if conn_name not in relocatable_connectors - } + ) if len(non_relocatable_connectors) == 0: return None diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/multi_state_global_self_copy_elimination.py b/src/gt4py/next/program_processors/runners/dace/transformations/multi_state_global_self_copy_elimination.py index 7854299780..3e349b811c 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/multi_state_global_self_copy_elimination.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/multi_state_global_self_copy_elimination.py @@ -37,6 +37,9 @@ def gt_multi_state_global_self_copy_elimination( The function will also run `MultiStateGlobalSelfCopyElimination2`, but the results are merged together. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() + transforms = [ gtx_transformations.MultiStateGlobalSelfCopyElimination(), gtx_transformations.MultiStateGlobalSelfCopyElimination2(), diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/redundant_array_removers.py b/src/gt4py/next/program_processors/runners/dace/transformations/redundant_array_removers.py index 44a03a730f..b4bc8c4999 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/redundant_array_removers.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/redundant_array_removers.py @@ -41,6 +41,8 @@ def gt_remove_copy_chain( single_use_data: Which data descriptors are used only once. If not passed the function will run `FindSingleUseData`. """ + # Sort SDFG for deterministic pattern matching. 
+ sdfg.sort_sdfg_alphabetically() # To ensures that the `{src,dst}_subset` are properly set, run initialization. # See [issue 1703](https://github.com/spcl/dace/issues/1703) diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/simplify.py b/src/gt4py/next/program_processors/runners/dace/transformations/simplify.py index 8c08f3459a..b9c5c22ca5 100644 --- a/src/gt4py/next/program_processors/runners/dace/transformations/simplify.py +++ b/src/gt4py/next/program_processors/runners/dace/transformations/simplify.py @@ -10,7 +10,7 @@ import collections import copy -import uuid +import hashlib import warnings from typing import Any, Iterable, Optional, TypeAlias @@ -83,6 +83,9 @@ def gt_simplify( elimination at the end. The whole process is run inside a loop that ensures that `gt_simplify()` results in a fix point. """ + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() + # Ensure that `skip` is a `set` skip = gtx_transformations.constants.GT_SIMPLIFY_DEFAULT_SKIP_SET if skip is None else set(skip) @@ -476,6 +479,9 @@ def gt_reduce_distributed_buffering( validate_all: bool = False, ) -> Optional[dict[dace.SDFG, dict[dace.SDFGState, set[str]]]]: """Removes distributed write back buffers.""" + # Sort SDFG for deterministic pattern matching. + sdfg.sort_sdfg_alphabetically() + pipeline = dace_ppl.Pipeline([DistributedBufferRelocator()]) all_result = {} @@ -1007,8 +1013,15 @@ def apply( # This is the tasklet that we will put inside the map, note we have to do it # this way to avoid some name clash stuff. + # Use a deterministic hash instead of uuid.uuid1() to ensure stable code + # generation across runs. The hash combines properties that are unique to + # this specific clone context. 
+        _clone_key = (
+            f"{tasklet.label}_{tasklet.code.as_string}_{map_entry.label}_{connector_name}_{access_node.data}"
+        )
+        _clone_hash = hashlib.md5(_clone_key.encode("utf-8"), usedforsecurity=False).hexdigest()
         inner_tasklet: dace_nodes.Tasklet = graph.add_tasklet(
-            name=f"{tasklet.label}__clone_{str(uuid.uuid1()).replace('-', '_')}",
+            name=f"{tasklet.label}__clone_{_clone_hash}",
             outputs=tasklet.out_connectors.keys(),
             inputs=set(),
             code=tasklet.code,
diff --git a/src/gt4py/next/program_processors/runners/dace/transformations/utils.py b/src/gt4py/next/program_processors/runners/dace/transformations/utils.py
index 68a7c33201..98f2d1e81f 100644
--- a/src/gt4py/next/program_processors/runners/dace/transformations/utils.py
+++ b/src/gt4py/next/program_processors/runners/dace/transformations/utils.py
@@ -8,7 +8,7 @@
 """Common functionality for the transformations/optimization pipeline."""
 
-import uuid
+import itertools
 from typing import Optional, Sequence, TypeVar, Union
 
 import dace
 
@@ -21,19 +21,25 @@
 _PassT = TypeVar("_PassT", bound=dace_ppl.Pass)
 
 
+_unique_name_counter = itertools.count()
+
+
 def unique_name(name: str) -> str:
     """Adds a unique string to `name`.
 
+    Uses a process-wide deterministic counter instead of a UUID so that
+    identical compilation runs produce identical, reproducible names.
+
     Note:
-        The names generates by this function are rather unstable and it should
+        The names generated by this function are rather unstable and it should
         not be used if a particular order should be enforced.
         This function is marked for deprecation.
     """
     maximal_length = 200
-    unique_sufix = str(uuid.uuid1()).replace("-", "_")
-    if len(name) > (maximal_length - len(unique_sufix)):
-        name = name[: (maximal_length - len(unique_sufix) - 1)]
-    return f"{name}_{unique_sufix}"
+    unique_suffix = str(next(_unique_name_counter))
+    if len(name) > (maximal_length - len(unique_suffix)):
+        name = name[: (maximal_length - len(unique_suffix) - 1)]
+    return f"{name}_{unique_suffix}"
 
 
 def gt_make_transients_persistent(
diff --git a/src/gt4py/next/program_processors/runners/dace/workflow/common.py b/src/gt4py/next/program_processors/runners/dace/workflow/common.py
index cfb0d23596..4ce7bb1b33 100644
--- a/src/gt4py/next/program_processors/runners/dace/workflow/common.py
+++ b/src/gt4py/next/program_processors/runners/dace/workflow/common.py
@@ -66,6 +66,9 @@ def set_dace_config(
     # `gt4py.next.program_processors.runners.dace.transfromations.gpu_utils.gt_gpu_transform_non_standard_memlet()`.
     dace.Config.set("compiler.cuda.allow_implicit_memlet_to_map", value=False)
 
+    # Enable deterministic SDFG sorting for reproducible code generation.
+    dace.Config.set("compiler.sdfg_alphabetical_sorting", value=True)
+
     if cmake_build_type is not None:
         dace.Config.set("compiler.build_type", value=cmake_build_type.value)