Skip to content

multi_order_model

MultiOrderModel

MultiOrderModel based on torch_geometric.Data.

Source code in src/pathpyG/core/multi_order_model.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
class MultiOrderModel:
    """MultiOrderModel based on torch_geometric.Data."""

    def __init__(self) -> None:
        self.layers: dict[int, Graph] = {}

    def __str__(self) -> str:
        """Return a string representation of the higher-order graph."""
        max_order = max(list(self.layers.keys())) if self.layers else 0
        s = f"MultiOrderModel with max. order {max_order}"
        return s

    @staticmethod
    def iterate_lift_order(
        edge_index: torch.Tensor,
        node_sequence: torch.Tensor,
        mapping: IndexMap,
        edge_weight: torch.Tensor | None = None,
        aggr: str = "src",
        save: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, Graph | None]:
        """Lift order by one and save the result in the layers dictionary of the object.
        This is a helper function that should not be called directly.
        Only use for edge_indices after the special cases have been handled e.g.
        in the from_temporal_graph (filtering non-time-respecting paths of order 2).

        Args:
            edge_index: The edge index of the (k-1)-th order graph.
            node_sequence: The node sequences of the (k-1)-th order graph.
            edge_weight: The edge weights of the (k-1)-th order graph.
            k: The order of the graph that should be computed.
            aggr: The aggregation method to use. One of "src", "dst", "max", "mul".
            save: Whether to compute the aggregated graph and later save it in the layers dictionary.
        """
        # Lift order
        if edge_weight is None:
            ho_index = lift_order_edge_index(edge_index, num_nodes=node_sequence.size(0))
        else:
            ho_index, edge_weight = lift_order_edge_index_weighted(
                edge_index, edge_weight=edge_weight, num_nodes=node_sequence.size(0), aggr=aggr
            )
        node_sequence = torch.cat([node_sequence[edge_index[0]], node_sequence[edge_index[1]][:, -1:]], dim=1)

        # Aggregate
        if save:
            gk = aggregate_edge_index(ho_index, node_sequence, edge_weight)
            gk.mapping = IndexMap([tuple(mapping.to_ids(v.cpu())) for v in gk.data.node_sequence])
        else:
            gk = None
        return ho_index, node_sequence, edge_weight, gk

    @staticmethod
    def from_temporal_graph(
        g: TemporalGraph, delta: float | int = 1,
        max_order: int = 1,
        weight: str = "edge_weight",
        cached: bool = True,
        event_graph: torch.Tensor = None
    ) -> MultiOrderModel:
        """Creates multiple higher-order De Bruijn graph models for paths in a temporal graph.

        Args:
            g: The temporal graph.
            delta: The maximum time difference between two consecutive edges in a path.
            max_order: The maximum order of the MultiOrderModel that should be computed.
            weight: The edge attribute to use as edge weight.
            cached: Whether to save the aggregated higher-order graphs smaller than max order in the MultiOrderModel.
            event_graph: precomputed event graph edge index for given delta to be used for model generation. Useful to prevent the same event graph 
            from being computed twice.

        Returns:
            MultiOrderModel: A multi-order model where each layer is a De Bruijn graph with order k.
        """
        m = MultiOrderModel()
        if not g.data.is_sorted_by_time():
            data = g.data.sort_by_time()
        else:
            data = g.data
        edge_index = data.edge_index
        node_sequence = torch.arange(data.num_nodes, device=edge_index.device).unsqueeze(1)
        if weight in data:
            edge_weight = data[weight]
        else:
            edge_weight = torch.ones(edge_index.size(1), device=edge_index.device)
        if cached or max_order == 1:
            m.layers[1] = aggregate_edge_index(
                edge_index=edge_index, node_sequence=node_sequence, edge_weight=edge_weight
            )
            m.layers[1].mapping = g.mapping

        if max_order > 1:
            node_sequence = torch.cat([node_sequence[edge_index[0]], node_sequence[edge_index[1]][:, -1:]], dim=1)
            if event_graph is None:
                edge_index = lift_order_temporal(g, delta)
            else:
                edge_index = event_graph
            edge_weight = aggregate_node_attributes(edge_index, edge_weight, "src")

            # Aggregate
            if cached or max_order == 2:
                m.layers[2] = aggregate_edge_index(
                    edge_index=edge_index, node_sequence=node_sequence, edge_weight=edge_weight
                )
                m.layers[2].mapping = IndexMap(
                    [tuple(g.mapping.to_ids(v.cpu())) for v in m.layers[2].data.node_sequence]
                )

            for k in range(3, max_order + 1):
                edge_index, node_sequence, edge_weight, gk = MultiOrderModel.iterate_lift_order(
                    edge_index=edge_index,
                    node_sequence=node_sequence,
                    mapping=g.mapping,
                    edge_weight=edge_weight,
                    aggr="src",
                    save=cached or k == max_order,
                )
                if cached or k == max_order:
                    m.layers[k] = gk

        return m

    @staticmethod
    def from_PathData(
        path_data: PathData, max_order: int = 1, mode: str = "propagation", cached: bool = True
    ) -> MultiOrderModel:
        """
        Creates multiple higher-order De Bruijn graphs modelling paths in PathData.

        Args:
            path_data: `PathData` object containing paths as list of PyG Data objects
                with sorted edge indices, node sequences and num_nodes.
            max_order: The maximum order of the MultiOrderModel that should be computed
            mode: The process that we assume. Can be "diffusion" or "propagation".
            cached: Whether to save the aggregated higher-order graphs smaller than max order
                in the MultiOrderModel.

        Returns:
            MultiOrderModel: The MultiOrderModel.
        """
        m = MultiOrderModel()

        # We assume that paths are sorted
        path_graph = path_data.data
        edge_index = path_graph.edge_index
        node_sequence = path_graph.node_sequence
        edge_weight = path_graph.dag_weight.repeat_interleave(path_graph.dag_num_edges)
        if mode == "diffusion":
            edge_weight = (
                edge_weight / degree(edge_index[0], dtype=torch.long, num_nodes=node_sequence.size(0))[edge_index[0]]
            )
            aggr = "mul"
        elif mode == "propagation":
            aggr = "src"

        m.layers[1] = aggregate_edge_index(edge_index=edge_index, node_sequence=node_sequence, edge_weight=edge_weight)
        m.layers[1].mapping = path_data.mapping

        for k in range(2, max_order + 1):
            edge_index, node_sequence, edge_weight, gk = MultiOrderModel.iterate_lift_order(
                edge_index=edge_index,
                node_sequence=node_sequence,
                mapping=m.layers[1].mapping,
                edge_weight=edge_weight,
                aggr=aggr,
                save=cached or k == max_order,
            )
            if cached or k == max_order:
                m.layers[k] = gk

        return m

    def get_mon_dof(self, max_order: int = None, assumption: str = "paths") -> int:
        """
        The degrees of freedom for the kth layer of a multi-order model. This depends on the number of different paths of exactly length `k` in the graph.
        Therefore, we can obtain these values by summing the entries of the `k`-th power of the binary adjacency matrix of the graph.
        Finally, we must consider that, due the conservation of probablility, all non-zero rows of the transition matrix of the higher-order network must sum to one.
        This poses one additional constraint per row that respects the condition, which should be removed from the total count of degrees of freedom.

        Args:
            m (MultiOrderModel): The multi-order model.
            max_order (int, optional): The maximum order up to which model layers
                shall be taken into account. Defaults to None, meaning it considers
                all available layers.
            assumption (str, optional): If set to 'paths', only paths in the
                first-order network topology will be considered for the degree of
                freedom calculation. If set to 'ngrams', all possible n-grams will
                be considered, independent of whether they are valid paths in the
                first-order network or not. Defaults to 'paths'.

        Returns:
            int: The degrees of freedom for the multi-order model.

        Raises:
            AssertionError: If max_order is larger than the maximum order of
                the multi-order network.
            ValueError: If the assumption is not 'paths' or 'ngrams'.
        """
        if max_order is None:
            max_order = max(self.layers)

        assert max_order <= max(
            self.layers
        ), "Error: max_order cannot be larger than maximum order of multi-order network"

        dof = self.layers[1].data.num_nodes - 1  # Degrees of freedom for zeroth order

        if assumption == "paths":
            # COMPUTING CONTRIBUTION FROM NUM PATHS AND NONZERO OUTDEGREES SEPARATELY
            # TODO: CAN IT BE DONE TOGETHER?

            edge_index = self.layers[1].data.edge_index
            # Adding dof from Number of paths of length k
            for k in range(1, max_order + 1):
                if k > 1:
                    num_nodes = 0 if edge_index.numel() == 0 else edge_index.max().item() + 1
                    edge_index = lift_order_edge_index(edge_index, num_nodes)
                # counting number of len k paths
                num_len_k_paths = edge_index.shape[1]  # edge_index.max().item() +1  # Number of paths of length k
                dof += num_len_k_paths

            # removing dof from total probability of nonzero degree nodes
            for k in range(1, max_order + 1):
                if k == 1:
                    # edge_index of temporal graph is sorted by time by default
                    # For matrix multiplication, we need to sort it by row
                    edge_index_adj = self.layers[1].data.edge_index.sort_by("row")[0]
                    edge_index = edge_index_adj
                else:
                    edge_index, _ = edge_index.matmul(edge_index_adj)
                num_nonzero_outdegrees = torch.unique(edge_index[0]).size(0)
                dof -= num_nonzero_outdegrees

        elif assumption == "ngrams":
            for order in range(1, max_order + 1):
                dof += (self.layers[1].data.num_nodes ** order) * (self.layers[1].data.num_nodes - 1)
        else:
            raise ValueError(
                f"Unknown assumption {assumption} in input. The only accepted values are 'path' and 'ngram'"
            )

        return int(dof)

    def get_zeroth_order_log_likelihood(self, dag_graph: Data) -> float:
        """
        Compute the zeroth order log likelihood.

        Args:
            dag_graph (Data): Input DAG graph data.

        Returns:
            float: Zeroth order log likelihood.
        """
        # Get frequencies
        # getting the index of the last edge of each path (to be used to extract weights)
        frequencies = dag_graph.dag_weight

        # Get ixs starting nodes
        # Q: Is dag_graph.path_index[:-1] enough to get the start_ixs?
        mask = torch.ones(dag_graph.num_nodes, dtype=bool)
        mask[dag_graph.edge_index[1]] = False
        start_ixs = dag_graph.node_sequence.squeeze()[mask]

        # Compute node emission probabilities
        # TODOL modify once we have zeroth order in mon
        _, counts = torch.unique(dag_graph.node_sequence, return_counts=True)
        # WARNING: Only works if all nodes in the first-order graph are also in `node_sequence`
        # Otherwise the missing nodes will not be included in `counts` which can lead to elements at the wrong index.
        node_emission_probabilities = counts / counts.sum()
        return torch.mul(frequencies, torch.log(node_emission_probabilities[start_ixs])).sum().item()

    def get_intermediate_order_log_likelihood(self, dag_graph: Data, order: int) -> float:
        """
        Compute the intermediate order log likelihood.

        Args:
            m (MultiOrderModel): Multi-order model.
            dag_graph (Data): Input DAG graph data.
            order (int): Order of the intermediate log likelihood.

        Returns:
            float: Intermediate order log likelihood.
        """
        # Get frequencies
        frequencies = dag_graph.dag_weight
        path_lengths = dag_graph.dag_num_nodes
        # paths shrink by 'order' if we encode them using higher-order nodes
        paths_lenghts_ho = path_lengths - order
        # selecting only path that didn t shrink to zero due to higher-order transformation
        paths_lenghts_ho_filtered = paths_lenghts_ho[paths_lenghts_ho > 0]
        frequencies = frequencies[paths_lenghts_ho > 0]
        # start index of the path in the higher order space 
        ixs_start_paths_ho = cumsum(paths_lenghts_ho_filtered)[:-1]

        transition_probabilities = self.layers[order].transition_probabilities()[
            self.layers[order + 1].data.inverse_idx[ixs_start_paths_ho]
            ]

        log_transition_probabilities = torch.log(transition_probabilities)
        llh_by_subpath = torch.mul(frequencies, log_transition_probabilities)
        return llh_by_subpath.sum().item()

    def get_mon_log_likelihood(self, dag_graph: Data, max_order: int = 1) -> float:
        """
        Compute the likelihood of the walks given a multi-order model.

        Args:
            m (MultiOrderModel): The multi-order model.
            dag_graph (Data): Dataset containing the walks.
            max_order (int, optional): The maximum order up to which model layers
                shall be taken into account. Defaults to 1.

        Returns:
            float: The log likelihood of the walks given the multi-order model.
        """
        llh = 0

        # Adding likelihood of zeroth order
        llh += self.get_zeroth_order_log_likelihood(dag_graph)

        # Adding the likelihood for all the intermediate orders
        for order in range(1, max_order):
            llh += self.get_intermediate_order_log_likelihood(dag_graph, order)

        # Adding the likelihood of highest/stationary order
        if max_order > 0:
            transition_probabilities = self.layers[max_order].transition_probabilities()
            log_transition_probabilities = torch.log(transition_probabilities)
            llh_by_subpath = log_transition_probabilities * self.layers[max_order].data.edge_weight
            llh += llh_by_subpath.sum().item()
        else:
            # Compute likelihood for zeroth order (to be modified)
            # TODO: modify once we have zeroth order in mon
            # (then won t need to compute emission probs from dag_graph -- which also hinders us from computing the lh that a new set of paths was generated by the model)
            frequencies = dag_graph.dag_weight
            counts = torch.bincount(
                dag_graph.node_sequence.squeeze(), frequencies.repeat_interleave(dag_graph.dag_num_nodes)
            )
            node_emission_probabilities = counts / counts.sum()
            llh = torch.mul(torch.log(node_emission_probabilities), counts).sum().item()

        return llh

    def likelihood_ratio_test(
        self,
        dag_graph: Data,
        max_order_null: int = 0,
        max_order: int = 1,
        assumption: str = "paths",
        significance_threshold: float = 0.01,
    ) -> tuple:
        """
        Perform a likelihood ratio test to compare two models of different order.

        Args:
            dag_graph (Data): The input DAG graph data.
            max_order_null (int, optional): The maximum order of the null hypothesis model.
                Defaults to 0.
            max_order (int, optional): The maximum order of the alternative hypothesis model.
                Defaults to 1.
            assumption (str, optional): The assumption to use for the degrees of freedom calculation.
                Can be 'paths' or 'ngrams'. Defaults to 'paths'.
            significance_threshold (float, optional): The significance threshold for the test.
                Defaults to 0.01.

        Returns:
            tuple: A tuple containing a boolean indicating whether the null hypothesis is rejected
                and the p-value of the test.
        """
        assert (
            max_order_null < max_order
        ), "Error: order of null hypothesis must be smaller than order of alternative hypothesis"
        assert max_order <= max(
            self.layers
        ), f"Error: order of hypotheses ({max_order_null} and {max_order}) must be smaller than the maximum order of the MultiOrderModel {max(self.layers)}"
        # let L0 be the likelihood for the null model and L1 be the likelihood for the alternative model

        # we first compute a test statistic x = -2 * log (L0/L1) = -2 * (log L0 - log L1)
        x = -2 * (
            self.get_mon_log_likelihood(dag_graph, max_order=max_order_null)
            - self.get_mon_log_likelihood(dag_graph, max_order=max_order)
        )

        # we calculate the additional degrees of freedom in the alternative model
        dof_diff = self.get_mon_dof(max_order, assumption=assumption) - self.get_mon_dof(
            max_order_null, assumption=assumption
        )

        # if the p-value is *below* the significance threshold, we reject the null hypothesis
        p = 1 - chi2.cdf(x, dof_diff)
        return (p < significance_threshold), p

    def estimate_order(self, dag_data: PathData, max_order: int = None, significance_threshold: float = 0.01) -> int:
        """
        Selects the optimal maximum order of a multi-order network model for the
        observed paths, based on a likelihood ratio test with p-value threshold of p
        By default, all orders up to the maximum order of the multi-order model will be tested.

        Args:
            dag_data (DAGData): The path statistics data for which to estimate the optimal order.
            max_order (int, optional): The maximum order to consider during the estimation process.
                If not provided, the maximum order of the multi-order model is used.
            significance_threshold (float, optional): The p-value threshold for the likelihood ratio test.
                An order is accepted if the improvement in likelihood is significant at this threshold.

        Returns:
            int: The estimated optimal maximum order for the multi-order network model.

        Raises:
            AssertionError: If the provided max_order is larger than the maximum order of the multi-order model
                or if the input DAGData does not have the same set of nodes as the multi-order network
        """
        if max_order is None:
            max_order = max(self.layers)  # THIS
        assert max_order <= max(
            self.layers
        ), "Error: maxOrder cannot be larger than maximum order of multi-order network"
        assert max_order > 1, "Error: max_order must be larger than one"

        assert set(dag_data.mapping.node_ids).intersection(set(self.layers[1].mapping.node_ids)) == set(
            dag_data.mapping.node_ids
        ), "Input DAGData doesn t have the same set of nodes as those of the multi-order network"

        max_accepted_order = 1
        dag_graph = dag_data.data

        # Test for highest order that passes
        # likelihood ratio test against null model
        for k in range(2, max_order + 1):
            if self.likelihood_ratio_test(
                dag_graph, max_order_null=k - 1, max_order=k, significance_threshold=significance_threshold
            )[0]:
                max_accepted_order = k

        return max_accepted_order

    def to_dbgnn_data(self, max_order: int = 2, mapping: str = "last") -> Data:
        """
        Convert the MultiOrderModel to a De Bruijn graph for the given maximum order
        that can be used in `pathpyG.nn.dbgnn.DBGNN`.

        Args:
            max_order: The maximum order of the De Bruijn graph to be computed.
            mapping: The mapping to use for the bipartite edge index. One of "last", "first", or "both".

        Returns:
            Data: The De Bruijn graph data.
        """
        if max_order not in self.layers:
            raise ValueError(f"Higher-order graph of order {max_order} not found.")

        g = self.layers[1]
        g_max_order = self.layers[max_order]
        num_nodes = g.data.num_nodes
        num_ho_nodes = g_max_order.data.num_nodes
        if g.data.x is not None:
            x = g.data.x
        else:
            x = torch.eye(num_nodes, num_nodes)
        x_max_order = torch.eye(num_ho_nodes, num_ho_nodes)
        edge_index = g.data.edge_index
        edge_index_max_order = g_max_order.data.edge_index
        edge_weight = g.data.edge_weight
        edge_weight_max_order = g_max_order.data.edge_weight
        bipartite_edge_index = generate_bipartite_edge_index(g, g_max_order, mapping=mapping)

        if g.data.y is not None:
            y = g.data.y

        return Data(
            num_nodes=num_nodes,
            num_ho_nodes=num_ho_nodes,
            x=x,
            x_h=x_max_order,
            edge_index=edge_index,
            edge_index_higher_order=edge_index_max_order,
            edge_weights=edge_weight.float(),
            edge_weights_higher_order=edge_weight_max_order.float(),
            bipartite_edge_index=bipartite_edge_index,
            y=y if "y" in locals() else None,
        )

__str__

Return a string representation of the higher-order graph.

Source code in src/pathpyG/core/multi_order_model.py
def __str__(self) -> str:
    """Return a string representation of the higher-order graph."""
    max_order = max(list(self.layers.keys())) if self.layers else 0
    s = f"MultiOrderModel with max. order {max_order}"
    return s

estimate_order

Selects the optimal maximum order of a multi-order network model for the observed paths, based on a likelihood ratio test with p-value threshold of p By default, all orders up to the maximum order of the multi-order model will be tested.

Parameters:

Name Type Description Default
dag_data DAGData

The path statistics data for which to estimate the optimal order.

required
max_order int

The maximum order to consider during the estimation process. If not provided, the maximum order of the multi-order model is used.

None
significance_threshold float

The p-value threshold for the likelihood ratio test. An order is accepted if the improvement in likelihood is significant at this threshold.

0.01

Returns:

Name Type Description
int int

The estimated optimal maximum order for the multi-order network model.

Raises:

Type Description
AssertionError

If the provided max_order is larger than the maximum order of the multi-order model or if the input DAGData does not have the same set of nodes as the multi-order network

Source code in src/pathpyG/core/multi_order_model.py
def estimate_order(self, dag_data: PathData, max_order: int = None, significance_threshold: float = 0.01) -> int:
    """
    Selects the optimal maximum order of a multi-order network model for the
    observed paths, based on a likelihood ratio test with p-value threshold of p
    By default, all orders up to the maximum order of the multi-order model will be tested.

    Args:
        dag_data (DAGData): The path statistics data for which to estimate the optimal order.
        max_order (int, optional): The maximum order to consider during the estimation process.
            If not provided, the maximum order of the multi-order model is used.
        significance_threshold (float, optional): The p-value threshold for the likelihood ratio test.
            An order is accepted if the improvement in likelihood is significant at this threshold.

    Returns:
        int: The estimated optimal maximum order for the multi-order network model.

    Raises:
        AssertionError: If the provided max_order is larger than the maximum order of the multi-order model
            or if the input DAGData does not have the same set of nodes as the multi-order network
    """
    if max_order is None:
        max_order = max(self.layers)  # THIS
    assert max_order <= max(
        self.layers
    ), "Error: maxOrder cannot be larger than maximum order of multi-order network"
    assert max_order > 1, "Error: max_order must be larger than one"

    assert set(dag_data.mapping.node_ids).intersection(set(self.layers[1].mapping.node_ids)) == set(
        dag_data.mapping.node_ids
    ), "Input DAGData doesn t have the same set of nodes as those of the multi-order network"

    max_accepted_order = 1
    dag_graph = dag_data.data

    # Test for highest order that passes
    # likelihood ratio test against null model
    for k in range(2, max_order + 1):
        if self.likelihood_ratio_test(
            dag_graph, max_order_null=k - 1, max_order=k, significance_threshold=significance_threshold
        )[0]:
            max_accepted_order = k

    return max_accepted_order

from_PathData staticmethod

Creates multiple higher-order De Bruijn graphs modelling paths in PathData.

Parameters:

Name Type Description Default
path_data pathpyG.core.path_data.PathData

PathData object containing paths as list of PyG Data objects with sorted edge indices, node sequences and num_nodes.

required
max_order int

The maximum order of the MultiOrderModel that should be computed

1
mode str

The process that we assume. Can be "diffusion" or "propagation".

'propagation'
cached bool

Whether to save the aggregated higher-order graphs smaller than max order in the MultiOrderModel.

True

Returns:

Name Type Description
MultiOrderModel pathpyG.core.multi_order_model.MultiOrderModel

The MultiOrderModel.

Source code in src/pathpyG/core/multi_order_model.py
@staticmethod
def from_PathData(
    path_data: PathData, max_order: int = 1, mode: str = "propagation", cached: bool = True
) -> MultiOrderModel:
    """
    Creates multiple higher-order De Bruijn graphs modelling paths in PathData.

    Args:
        path_data: `PathData` object containing paths as list of PyG Data objects
            with sorted edge indices, node sequences and num_nodes.
        max_order: The maximum order of the MultiOrderModel that should be computed
        mode: The process that we assume. Can be "diffusion" or "propagation".
        cached: Whether to save the aggregated higher-order graphs smaller than max order
            in the MultiOrderModel.

    Returns:
        MultiOrderModel: The MultiOrderModel.
    """
    m = MultiOrderModel()

    # We assume that paths are sorted
    path_graph = path_data.data
    edge_index = path_graph.edge_index
    node_sequence = path_graph.node_sequence
    edge_weight = path_graph.dag_weight.repeat_interleave(path_graph.dag_num_edges)
    if mode == "diffusion":
        edge_weight = (
            edge_weight / degree(edge_index[0], dtype=torch.long, num_nodes=node_sequence.size(0))[edge_index[0]]
        )
        aggr = "mul"
    elif mode == "propagation":
        aggr = "src"

    m.layers[1] = aggregate_edge_index(edge_index=edge_index, node_sequence=node_sequence, edge_weight=edge_weight)
    m.layers[1].mapping = path_data.mapping

    for k in range(2, max_order + 1):
        edge_index, node_sequence, edge_weight, gk = MultiOrderModel.iterate_lift_order(
            edge_index=edge_index,
            node_sequence=node_sequence,
            mapping=m.layers[1].mapping,
            edge_weight=edge_weight,
            aggr=aggr,
            save=cached or k == max_order,
        )
        if cached or k == max_order:
            m.layers[k] = gk

    return m

from_temporal_graph staticmethod

Creates multiple higher-order De Bruijn graph models for paths in a temporal graph.

Parameters:

Name Type Description Default
g pathpyG.core.temporal_graph.TemporalGraph

The temporal graph.

required
delta float | int

The maximum time difference between two consecutive edges in a path.

1
max_order int

The maximum order of the MultiOrderModel that should be computed.

1
weight str

The edge attribute to use as edge weight.

'edge_weight'
cached bool

Whether to save the aggregated higher-order graphs smaller than max order in the MultiOrderModel.

True
event_graph torch.Tensor

precomputed event graph edge index for given delta to be used for model generation. Useful to prevent the same event graph

None

Returns:

Name Type Description
MultiOrderModel pathpyG.core.multi_order_model.MultiOrderModel

A multi-order model where each layer is a De Bruijn graph with order k.

Source code in src/pathpyG/core/multi_order_model.py
@staticmethod
def from_temporal_graph(
    g: TemporalGraph, delta: float | int = 1,
    max_order: int = 1,
    weight: str = "edge_weight",
    cached: bool = True,
    event_graph: torch.Tensor = None
) -> MultiOrderModel:
    """Creates multiple higher-order De Bruijn graph models for paths in a temporal graph.

    Args:
        g: The temporal graph.
        delta: The maximum time difference between two consecutive edges in a path.
        max_order: The maximum order of the MultiOrderModel that should be computed.
        weight: The edge attribute to use as edge weight.
        cached: Whether to save the aggregated higher-order graphs smaller than max order in the MultiOrderModel.
        event_graph: precomputed event graph edge index for given delta to be used for model generation. Useful to prevent the same event graph 
        from being computed twice.

    Returns:
        MultiOrderModel: A multi-order model where each layer is a De Bruijn graph with order k.
    """
    m = MultiOrderModel()
    if not g.data.is_sorted_by_time():
        data = g.data.sort_by_time()
    else:
        data = g.data
    edge_index = data.edge_index
    node_sequence = torch.arange(data.num_nodes, device=edge_index.device).unsqueeze(1)
    if weight in data:
        edge_weight = data[weight]
    else:
        edge_weight = torch.ones(edge_index.size(1), device=edge_index.device)
    if cached or max_order == 1:
        m.layers[1] = aggregate_edge_index(
            edge_index=edge_index, node_sequence=node_sequence, edge_weight=edge_weight
        )
        m.layers[1].mapping = g.mapping

    if max_order > 1:
        node_sequence = torch.cat([node_sequence[edge_index[0]], node_sequence[edge_index[1]][:, -1:]], dim=1)
        if event_graph is None:
            edge_index = lift_order_temporal(g, delta)
        else:
            edge_index = event_graph
        edge_weight = aggregate_node_attributes(edge_index, edge_weight, "src")

        # Aggregate
        if cached or max_order == 2:
            m.layers[2] = aggregate_edge_index(
                edge_index=edge_index, node_sequence=node_sequence, edge_weight=edge_weight
            )
            m.layers[2].mapping = IndexMap(
                [tuple(g.mapping.to_ids(v.cpu())) for v in m.layers[2].data.node_sequence]
            )

        for k in range(3, max_order + 1):
            edge_index, node_sequence, edge_weight, gk = MultiOrderModel.iterate_lift_order(
                edge_index=edge_index,
                node_sequence=node_sequence,
                mapping=g.mapping,
                edge_weight=edge_weight,
                aggr="src",
                save=cached or k == max_order,
            )
            if cached or k == max_order:
                m.layers[k] = gk

    return m

get_intermediate_order_log_likelihood

Compute the intermediate order log likelihood.

Parameters:

Name Type Description Default
m pathpyG.core.multi_order_model.MultiOrderModel

Multi-order model.

required
dag_graph torch_geometric.data.Data

Input DAG graph data.

required
order int

Order of the intermediate log likelihood.

required

Returns:

Name Type Description
float float

Intermediate order log likelihood.

Source code in src/pathpyG/core/multi_order_model.py
def get_intermediate_order_log_likelihood(self, dag_graph: Data, order: int) -> float:
    """
    Compute the intermediate order log likelihood.

    Args:
        m (MultiOrderModel): Multi-order model.
        dag_graph (Data): Input DAG graph data.
        order (int): Order of the intermediate log likelihood.

    Returns:
        float: Intermediate order log likelihood.
    """
    # Get frequencies
    frequencies = dag_graph.dag_weight
    path_lengths = dag_graph.dag_num_nodes
    # paths shrink by 'order' if we encode them using higher-order nodes
    paths_lenghts_ho = path_lengths - order
    # selecting only path that didn t shrink to zero due to higher-order transformation
    paths_lenghts_ho_filtered = paths_lenghts_ho[paths_lenghts_ho > 0]
    frequencies = frequencies[paths_lenghts_ho > 0]
    # start index of the path in the higher order space 
    ixs_start_paths_ho = cumsum(paths_lenghts_ho_filtered)[:-1]

    transition_probabilities = self.layers[order].transition_probabilities()[
        self.layers[order + 1].data.inverse_idx[ixs_start_paths_ho]
        ]

    log_transition_probabilities = torch.log(transition_probabilities)
    llh_by_subpath = torch.mul(frequencies, log_transition_probabilities)
    return llh_by_subpath.sum().item()

get_mon_dof

The degrees of freedom for the kth layer of a multi-order model. This depends on the number of different paths of exactly length k in the graph. Therefore, we can obtain these values by summing the entries of the k-th power of the binary adjacency matrix of the graph. Finally, we must consider that, due the conservation of probablility, all non-zero rows of the transition matrix of the higher-order network must sum to one. This poses one additional constraint per row that respects the condition, which should be removed from the total count of degrees of freedom.

Parameters:

Name Type Description Default
m pathpyG.core.multi_order_model.MultiOrderModel

The multi-order model.

required
max_order int

The maximum order up to which model layers shall be taken into account. Defaults to None, meaning it considers all available layers.

None
assumption str

If set to 'paths', only paths in the first-order network topology will be considered for the degree of freedom calculation. If set to 'ngrams', all possible n-grams will be considered, independent of whether they are valid paths in the first-order network or not. Defaults to 'paths'.

'paths'

Returns:

Name Type Description
int int

The degrees of freedom for the multi-order model.

Raises:

Type Description
AssertionError

If max_order is larger than the maximum order of the multi-order network.

ValueError

If the assumption is not 'paths' or 'ngrams'.

Source code in src/pathpyG/core/multi_order_model.py
def get_mon_dof(self, max_order: int = None, assumption: str = "paths") -> int:
    """
    The degrees of freedom for the kth layer of a multi-order model. This depends on the number of different paths of exactly length `k` in the graph.
    Therefore, we can obtain these values by summing the entries of the `k`-th power of the binary adjacency matrix of the graph.
    Finally, we must consider that, due the conservation of probablility, all non-zero rows of the transition matrix of the higher-order network must sum to one.
    This poses one additional constraint per row that respects the condition, which should be removed from the total count of degrees of freedom.

    Args:
        m (MultiOrderModel): The multi-order model.
        max_order (int, optional): The maximum order up to which model layers
            shall be taken into account. Defaults to None, meaning it considers
            all available layers.
        assumption (str, optional): If set to 'paths', only paths in the
            first-order network topology will be considered for the degree of
            freedom calculation. If set to 'ngrams', all possible n-grams will
            be considered, independent of whether they are valid paths in the
            first-order network or not. Defaults to 'paths'.

    Returns:
        int: The degrees of freedom for the multi-order model.

    Raises:
        AssertionError: If max_order is larger than the maximum order of
            the multi-order network.
        ValueError: If the assumption is not 'paths' or 'ngrams'.
    """
    if max_order is None:
        max_order = max(self.layers)

    assert max_order <= max(
        self.layers
    ), "Error: max_order cannot be larger than maximum order of multi-order network"

    dof = self.layers[1].data.num_nodes - 1  # Degrees of freedom for zeroth order

    if assumption == "paths":
        # COMPUTING CONTRIBUTION FROM NUM PATHS AND NONZERO OUTDEGREES SEPARATELY
        # TODO: CAN IT BE DONE TOGETHER?

        edge_index = self.layers[1].data.edge_index
        # Adding dof from Number of paths of length k
        for k in range(1, max_order + 1):
            if k > 1:
                num_nodes = 0 if edge_index.numel() == 0 else edge_index.max().item() + 1
                edge_index = lift_order_edge_index(edge_index, num_nodes)
            # counting number of len k paths
            num_len_k_paths = edge_index.shape[1]  # edge_index.max().item() +1  # Number of paths of length k
            dof += num_len_k_paths

        # removing dof from total probability of nonzero degree nodes
        for k in range(1, max_order + 1):
            if k == 1:
                # edge_index of temporal graph is sorted by time by default
                # For matrix multiplication, we need to sort it by row
                edge_index_adj = self.layers[1].data.edge_index.sort_by("row")[0]
                edge_index = edge_index_adj
            else:
                edge_index, _ = edge_index.matmul(edge_index_adj)
            num_nonzero_outdegrees = torch.unique(edge_index[0]).size(0)
            dof -= num_nonzero_outdegrees

    elif assumption == "ngrams":
        for order in range(1, max_order + 1):
            dof += (self.layers[1].data.num_nodes ** order) * (self.layers[1].data.num_nodes - 1)
    else:
        raise ValueError(
            f"Unknown assumption {assumption} in input. The only accepted values are 'path' and 'ngram'"
        )

    return int(dof)

get_mon_log_likelihood

Compute the likelihood of the walks given a multi-order model.

Parameters:

Name Type Description Default
m pathpyG.core.multi_order_model.MultiOrderModel

The multi-order model.

required
dag_graph torch_geometric.data.Data

Dataset containing the walks.

required
max_order int

The maximum order up to which model layers shall be taken into account. Defaults to 1.

1

Returns:

Name Type Description
float float

The log likelihood of the walks given the multi-order model.

Source code in src/pathpyG/core/multi_order_model.py
def get_mon_log_likelihood(self, dag_graph: Data, max_order: int = 1) -> float:
    """
    Compute the likelihood of the walks given a multi-order model.

    Args:
        m (MultiOrderModel): The multi-order model.
        dag_graph (Data): Dataset containing the walks.
        max_order (int, optional): The maximum order up to which model layers
            shall be taken into account. Defaults to 1.

    Returns:
        float: The log likelihood of the walks given the multi-order model.
    """
    llh = 0

    # Adding likelihood of zeroth order
    llh += self.get_zeroth_order_log_likelihood(dag_graph)

    # Adding the likelihood for all the intermediate orders
    for order in range(1, max_order):
        llh += self.get_intermediate_order_log_likelihood(dag_graph, order)

    # Adding the likelihood of highest/stationary order
    if max_order > 0:
        transition_probabilities = self.layers[max_order].transition_probabilities()
        log_transition_probabilities = torch.log(transition_probabilities)
        llh_by_subpath = log_transition_probabilities * self.layers[max_order].data.edge_weight
        llh += llh_by_subpath.sum().item()
    else:
        # Compute likelihood for zeroth order (to be modified)
        # TODO: modify once we have zeroth order in mon
        # (then won t need to compute emission probs from dag_graph -- which also hinders us from computing the lh that a new set of paths was generated by the model)
        frequencies = dag_graph.dag_weight
        counts = torch.bincount(
            dag_graph.node_sequence.squeeze(), frequencies.repeat_interleave(dag_graph.dag_num_nodes)
        )
        node_emission_probabilities = counts / counts.sum()
        llh = torch.mul(torch.log(node_emission_probabilities), counts).sum().item()

    return llh

get_zeroth_order_log_likelihood

Compute the zeroth order log likelihood.

Parameters:

Name Type Description Default
dag_graph torch_geometric.data.Data

Input DAG graph data.

required

Returns:

Name Type Description
float float

Zeroth order log likelihood.

Source code in src/pathpyG/core/multi_order_model.py
def get_zeroth_order_log_likelihood(self, dag_graph: Data) -> float:
    """
    Compute the zeroth order log likelihood.

    Args:
        dag_graph (Data): Input DAG graph data.

    Returns:
        float: Zeroth order log likelihood.
    """
    # Get frequencies
    # getting the index of the last edge of each path (to be used to extract weights)
    frequencies = dag_graph.dag_weight

    # Get ixs starting nodes
    # Q: Is dag_graph.path_index[:-1] enough to get the start_ixs?
    mask = torch.ones(dag_graph.num_nodes, dtype=bool)
    mask[dag_graph.edge_index[1]] = False
    start_ixs = dag_graph.node_sequence.squeeze()[mask]

    # Compute node emission probabilities
    # TODOL modify once we have zeroth order in mon
    _, counts = torch.unique(dag_graph.node_sequence, return_counts=True)
    # WARNING: Only works if all nodes in the first-order graph are also in `node_sequence`
    # Otherwise the missing nodes will not be included in `counts` which can lead to elements at the wrong index.
    node_emission_probabilities = counts / counts.sum()
    return torch.mul(frequencies, torch.log(node_emission_probabilities[start_ixs])).sum().item()

iterate_lift_order staticmethod

Lift order by one and save the result in the layers dictionary of the object. This is a helper function that should not be called directly. Only use for edge_indices after the special cases have been handled e.g. in the from_temporal_graph (filtering non-time-respecting paths of order 2).

Parameters:

Name Type Description Default
edge_index torch.Tensor

The edge index of the (k-1)-th order graph.

required
node_sequence torch.Tensor

The node sequences of the (k-1)-th order graph.

required
edge_weight torch.Tensor | None

The edge weights of the (k-1)-th order graph.

None
k

The order of the graph that should be computed.

required
aggr str

The aggregation method to use. One of "src", "dst", "max", "mul".

'src'
save bool

Whether to compute the aggregated graph and later save it in the layers dictionary.

True
Source code in src/pathpyG/core/multi_order_model.py
@staticmethod
def iterate_lift_order(
    edge_index: torch.Tensor,
    node_sequence: torch.Tensor,
    mapping: IndexMap,
    edge_weight: torch.Tensor | None = None,
    aggr: str = "src",
    save: bool = True,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, Graph | None]:
    """Lift order by one and save the result in the layers dictionary of the object.
    This is a helper function that should not be called directly.
    Only use for edge_indices after the special cases have been handled e.g.
    in the from_temporal_graph (filtering non-time-respecting paths of order 2).

    Args:
        edge_index: The edge index of the (k-1)-th order graph.
        node_sequence: The node sequences of the (k-1)-th order graph.
        edge_weight: The edge weights of the (k-1)-th order graph.
        k: The order of the graph that should be computed.
        aggr: The aggregation method to use. One of "src", "dst", "max", "mul".
        save: Whether to compute the aggregated graph and later save it in the layers dictionary.
    """
    # Lift order
    if edge_weight is None:
        ho_index = lift_order_edge_index(edge_index, num_nodes=node_sequence.size(0))
    else:
        ho_index, edge_weight = lift_order_edge_index_weighted(
            edge_index, edge_weight=edge_weight, num_nodes=node_sequence.size(0), aggr=aggr
        )
    node_sequence = torch.cat([node_sequence[edge_index[0]], node_sequence[edge_index[1]][:, -1:]], dim=1)

    # Aggregate
    if save:
        gk = aggregate_edge_index(ho_index, node_sequence, edge_weight)
        gk.mapping = IndexMap([tuple(mapping.to_ids(v.cpu())) for v in gk.data.node_sequence])
    else:
        gk = None
    return ho_index, node_sequence, edge_weight, gk

likelihood_ratio_test

Perform a likelihood ratio test to compare two models of different order.

Parameters:

Name Type Description Default
dag_graph torch_geometric.data.Data

The input DAG graph data.

required
max_order_null int

The maximum order of the null hypothesis model. Defaults to 0.

0
max_order int

The maximum order of the alternative hypothesis model. Defaults to 1.

1
assumption str

The assumption to use for the degrees of freedom calculation. Can be 'paths' or 'ngrams'. Defaults to 'paths'.

'paths'
significance_threshold float

The significance threshold for the test. Defaults to 0.01.

0.01

Returns:

Name Type Description
tuple tuple

A tuple containing a boolean indicating whether the null hypothesis is rejected and the p-value of the test.

Source code in src/pathpyG/core/multi_order_model.py
def likelihood_ratio_test(
    self,
    dag_graph: Data,
    max_order_null: int = 0,
    max_order: int = 1,
    assumption: str = "paths",
    significance_threshold: float = 0.01,
) -> tuple:
    """
    Perform a likelihood ratio test to compare two models of different order.

    Args:
        dag_graph (Data): The input DAG graph data.
        max_order_null (int, optional): The maximum order of the null hypothesis model.
            Defaults to 0.
        max_order (int, optional): The maximum order of the alternative hypothesis model.
            Defaults to 1.
        assumption (str, optional): The assumption to use for the degrees of freedom calculation.
            Can be 'paths' or 'ngrams'. Defaults to 'paths'.
        significance_threshold (float, optional): The significance threshold for the test.
            Defaults to 0.01.

    Returns:
        tuple: A tuple containing a boolean indicating whether the null hypothesis is rejected
            and the p-value of the test.
    """
    assert (
        max_order_null < max_order
    ), "Error: order of null hypothesis must be smaller than order of alternative hypothesis"
    assert max_order <= max(
        self.layers
    ), f"Error: order of hypotheses ({max_order_null} and {max_order}) must be smaller than the maximum order of the MultiOrderModel {max(self.layers)}"
    # let L0 be the likelihood for the null model and L1 be the likelihood for the alternative model

    # we first compute a test statistic x = -2 * log (L0/L1) = -2 * (log L0 - log L1)
    x = -2 * (
        self.get_mon_log_likelihood(dag_graph, max_order=max_order_null)
        - self.get_mon_log_likelihood(dag_graph, max_order=max_order)
    )

    # we calculate the additional degrees of freedom in the alternative model
    dof_diff = self.get_mon_dof(max_order, assumption=assumption) - self.get_mon_dof(
        max_order_null, assumption=assumption
    )

    # if the p-value is *below* the significance threshold, we reject the null hypothesis
    p = 1 - chi2.cdf(x, dof_diff)
    return (p < significance_threshold), p

to_dbgnn_data

Convert the MultiOrderModel to a De Bruijn graph for the given maximum order that can be used in pathpyG.nn.dbgnn.DBGNN.

Parameters:

Name Type Description Default
max_order int

The maximum order of the De Bruijn graph to be computed.

2
mapping str

The mapping to use for the bipartite edge index. One of "last", "first", or "both".

'last'

Returns:

Name Type Description
Data torch_geometric.data.Data

The De Bruijn graph data.

Source code in src/pathpyG/core/multi_order_model.py
def to_dbgnn_data(self, max_order: int = 2, mapping: str = "last") -> Data:
    """
    Convert the MultiOrderModel to a De Bruijn graph for the given maximum order
    that can be used in `pathpyG.nn.dbgnn.DBGNN`.

    Args:
        max_order: The maximum order of the De Bruijn graph to be computed.
        mapping: The mapping to use for the bipartite edge index. One of "last", "first", or "both".

    Returns:
        Data: The De Bruijn graph data.
    """
    if max_order not in self.layers:
        raise ValueError(f"Higher-order graph of order {max_order} not found.")

    g = self.layers[1]
    g_max_order = self.layers[max_order]
    num_nodes = g.data.num_nodes
    num_ho_nodes = g_max_order.data.num_nodes
    if g.data.x is not None:
        x = g.data.x
    else:
        x = torch.eye(num_nodes, num_nodes)
    x_max_order = torch.eye(num_ho_nodes, num_ho_nodes)
    edge_index = g.data.edge_index
    edge_index_max_order = g_max_order.data.edge_index
    edge_weight = g.data.edge_weight
    edge_weight_max_order = g_max_order.data.edge_weight
    bipartite_edge_index = generate_bipartite_edge_index(g, g_max_order, mapping=mapping)

    if g.data.y is not None:
        y = g.data.y

    return Data(
        num_nodes=num_nodes,
        num_ho_nodes=num_ho_nodes,
        x=x,
        x_h=x_max_order,
        edge_index=edge_index,
        edge_index_higher_order=edge_index_max_order,
        edge_weights=edge_weight.float(),
        edge_weights_higher_order=edge_weight_max_order.float(),
        bipartite_edge_index=bipartite_edge_index,
        y=y if "y" in locals() else None,
    )