# Source code for pm4py.algo.discovery.ocel.link_analysis.variants.classic

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from enum import Enum

from pm4py.util import exec_utils, constants, xes_constants, pandas_utils
from typing import Optional, Dict, Any, Set
import pandas as pd


class Parameters(Enum):
    """
    Keys of the ``parameters`` dictionary accepted by :func:`apply`.
    """
    # Column whose value is matched against IN_COLUMN of another row
    OUT_COLUMN = "out_column"
    # Column whose value is matched against OUT_COLUMN of another row
    IN_COLUMN = "in_column"
    # Column used to sort the dataframe before the index is inserted
    SORTING_COLUMN = "sorting_column"
    # Name of the index column inserted into the dataframe
    INDEX_COLUMN = "index_column"
    # Keep only the relations going from a lower index to a strictly higher one
    LOOK_FORWARD = "look_forward"
    # Keep, for every source row, only its first relation
    KEEP_FIRST_OCCURRENCE = "keep_first_occurrence"
    # Propagate the relations along chains (see propagate_associations)
    PROPAGATE = "propagate"


def propagate_associations(associations: Dict[str, Set[str]]) -> Dict[str, Set[str]]:
    """
    Propagate the associations, such that the eventually-follows
    flow between the events of the event log is considered.

    Every source ends up associated with all the targets that are reachable
    from it by following the associations transitively.

    The dictionary is updated in place (its values are rebound to new sets;
    the original set objects are not modified) and the same dictionary
    object is returned.

    Parameters
    -------------------
    associations
        Associations between events (source => set of directly linked targets)

    Returns
    ------------------
    propagated_associations
        Propagated associations
    """
    # predecessors[t] = sources that directly point at t. Only the direct
    # links are needed: whenever the set of a node grows, the growth is pushed
    # to its direct predecessors, which in turn are re-queued.
    predecessors = {}
    for source, targets in associations.items():
        for target in targets:
            predecessors.setdefault(target, set()).add(source)

    # Process the nodes in rounds until no association set grows anymore.
    pending = list(associations)
    while pending:
        grown = {}  # dict used as an insertion-ordered set
        for node in pending:
            for pred in predecessors.get(node, ()):
                merged = associations[pred] | associations[node]
                if len(merged) > len(associations[pred]):
                    associations[pred] = merged
                    grown[pred] = None
        pending = list(grown)
    return associations


def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> pd.DataFrame:
    """
    Performs a link analysis between the entries of the current dataframe.
    The link analysis permits advanced filtering based on events connected in an
    output-input relation (e.g., the OUT column of the first is equal to the IN column
    of the second).

    When OUT_COLUMN = IN_COLUMN = CASE ID, it can be equivalent to the directly-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = True), and to the eventually-follows graph
    (when Parameters.KEEP_FIRST_OCCURRENCE = False).

    Rows having a null value in the OUT column (resp. IN column) are never used as
    source (resp. target) of a relation.

    Parameters
    -----------------
    dataframe
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
        - Parameters.OUT_COLUMN => the output column of the dataframe
        (default: constants.CASE_CONCEPT_NAME)
        - Parameters.IN_COLUMN => the input column of the dataframe
        (default: constants.CASE_CONCEPT_NAME)
        - Parameters.SORTING_COLUMN => the column on top of which the dataframe is sorted
        before the index is inserted (default: xes_constants.DEFAULT_TIMESTAMP_KEY)
        - Parameters.INDEX_COLUMN => the attribute to use for the indexing
        (default: constants.DEFAULT_INDEX_KEY)
        - Parameters.LOOK_FORWARD => keeps only the relations in which the second event has an index
        strictly greater than the index of the first event (default: True).
        - Parameters.KEEP_FIRST_OCCURRENCE => keep, for every source event, only the first-occurring relationship
        with a target event (OUT=IN) (default: False).
        - Parameters.PROPAGATE => propagate the relationships between events, in such a way that the entire document
        flow chain can be reconstructed (default: False).

    Returns
    -----------------
    link_analysis_dataframe
        Link analysis dataframe: one row per relation, containing the columns of the
        source event (suffix "_out") and the columns of the target event (suffix "_in")
    """
    if parameters is None:
        parameters = {}

    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_column = exec_utils.get_param_value(Parameters.INDEX_COLUMN, parameters, constants.DEFAULT_INDEX_KEY)
    look_forward = exec_utils.get_param_value(Parameters.LOOK_FORWARD, parameters, True)
    keep_first_occurrence = exec_utils.get_param_value(Parameters.KEEP_FIRST_OCCURRENCE, parameters, False)
    propagate = exec_utils.get_param_value(Parameters.PROPAGATE, parameters, False)

    index_out = index_column + "_out"
    index_in = index_column + "_in"

    dataframe = dataframe.sort_values(sorting_column)
    dataframe = pandas_utils.insert_index(dataframe, index_column)

    # pandas matches null join keys with each other, which would link every
    # row lacking an OUT value to every row lacking an IN value: drop the
    # null keys before joining.
    df_out = dataframe[[out_column, index_column]].dropna(subset=[out_column])
    df_in = dataframe[[in_column, index_column]].dropna(subset=[in_column])
    df_red = df_out.merge(df_in, left_on=out_column, right_on=in_column, suffixes=("_out", "_in"))

    if look_forward:
        df_red = df_red[df_red[index_out] < df_red[index_in]]

    if keep_first_occurrence:
        df_red = df_red.groupby(index_out).first().reset_index()

    # source index => set of the target indexes linked to it
    associations = {}
    for source, target in zip(df_red[index_out].tolist(), df_red[index_in].tolist()):
        associations.setdefault(source, set()).add(target)

    if propagate:
        associations = propagate_associations(associations)

    # flatten the associations back to a two-columns relation table
    out_clmn = []
    in_clmn = []
    for source, targets in associations.items():
        for target in targets:
            out_clmn.append(source)
            in_clmn.append(target)

    rel = pd.DataFrame({index_out: out_clmn, index_in: in_clmn})

    # attach the full rows of the source events ("_out") and of the target
    # events ("_in") to every relation
    df_link = dataframe.add_suffix("_out").merge(rel, on=index_out)
    df_link = df_link.merge(dataframe.add_suffix("_in"), on=index_in)

    return df_link