Source code for pm4py.algo.organizational_mining.network_analysis.variants.dataframe

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from enum import Enum
from pm4py.util import exec_utils
from pm4py.util import xes_constants, constants, pandas_utils
import pandas as pd
from typing import Dict, Optional, Any, Tuple
from pm4py.util.business_hours import soj_time_business_hours_diff
from pm4py.algo.discovery.ocel.link_analysis.variants import classic as link_analysis


class Parameters(Enum):
    """
    Keys of the ``parameters`` dictionary accepted by the network analysis
    on Pandas dataframes.
    """

    # --- link definition (how events are connected to each other) ---
    # column that should be used to sort the log
    SORTING_COLUMN = "sorting_column"
    # name of the index attribute inserted in the log during the execution
    INDEX_KEY = "index_key"
    # timestamp column (key shared with the rest of the library)
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    # target column of the link
    IN_COLUMN = "in_column"
    # source column of the link
    OUT_COLUMN = "out_column"

    # --- network definition (nodes and edges) ---
    # attribute used for the node definition of the source event
    NODE_COLUMN_SOURCE = "node_column_source"
    # attribute used for the node definition of the target event
    NODE_COLUMN_TARGET = "node_column_target"
    # attribute used for the edge definition
    EDGE_COLUMN = "edge_column"

    # --- performance / business hours settings ---
    # whether the performance of the edge should be considered
    INCLUDE_PERFORMANCE = "include_performance"
    # boolean enabling the business hours
    BUSINESS_HOURS = "business_hours"
    # worktiming of the organization (e.g. [7, 17]), if business hours are enabled
    WORKTIMING = "worktiming"
    # weekends of the organization (e.g. [6, 7]), if business hours are enabled
    WEEKENDS = "weekends"
    # NOTE(review): presumably a work calendar used with business hours;
    # not described in this module's documentation - confirm against the consumer
    WORKCALENDAR = "workcalendar"
    # timestamp diff column
    TIMESTAMP_DIFF_COLUMN = "timestamp_diff_column"

    # event from which the edge attribute is picked
    # ("_out" => the source event, "_in" => the target event)
    EDGE_REFERENCE = "edge_reference"
def apply(dataframe: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None) -> Dict[
    Tuple[str, str], Dict[str, Any]]:
    """
    Performs the network analysis on the provided dataframe

    Parameters
    -----------------
    dataframe
        Dataframe
    parameters
        Parameters of the method, including:
        - Parameters.SORTING_COLUMN => the column that should be used to sort the log
        - Parameters.IN_COLUMN => the target column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.OUT_COLUMN => the source column of the link (default: the case identifier; events of the same case are linked)
        - Parameters.INDEX_KEY => the name for the index attribute in the log (inserted during the execution)
        - Parameters.NODE_COLUMN_SOURCE => the attribute to be used for the node definition of the source event (default: the resource of the log, org:resource)
        - Parameters.NODE_COLUMN_TARGET => the attribute to be used for the node definition of the target event (default: the resource of the log, org:resource)
        - Parameters.EDGE_COLUMN => the attribute to be used for the edge definition (default: the activity of the log, concept:name)
        - Parameters.EDGE_REFERENCE => the event into which the edge attribute should be picked:
            - _out  =>  the source event
            - _in   =>  the target event
        - Parameters.TIMESTAMP_KEY => the timestamp column
        - Parameters.TIMESTAMP_DIFF_COLUMN => timestamp diff column
        - Parameters.INCLUDE_PERFORMANCE => considers the performance of the edge
        - Parameters.BUSINESS_HOURS => boolean value that enables the business hours
        - Parameters.WORKTIMING => defines the worktiming of the organization (e.g. [7, 17]) if business hours are enabled
        - Parameters.WEEKENDS => defines the weekends of the organization (e.g. [6, 7]) if business hours are enabled

        The parameters that are not read here are forwarded unchanged to
        build_network_analysis_from_link_analysis.

    Returns
    -----------------
    network_analysis
        Edges of the network analysis (first key: edge; second key: type; value: number of occurrences)
    """
    if parameters is None:
        parameters = {}

    sorting_column = exec_utils.get_param_value(Parameters.SORTING_COLUMN, parameters,
                                                xes_constants.DEFAULT_TIMESTAMP_KEY)
    index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY)
    timestamp_column = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                                  xes_constants.DEFAULT_TIMESTAMP_KEY)
    in_column = exec_utils.get_param_value(Parameters.IN_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    out_column = exec_utils.get_param_value(Parameters.OUT_COLUMN, parameters, constants.CASE_CONCEPT_NAME)
    node_column_source = exec_utils.get_param_value(Parameters.NODE_COLUMN_SOURCE, parameters,
                                                    xes_constants.DEFAULT_RESOURCE_KEY)
    node_column_target = exec_utils.get_param_value(Parameters.NODE_COLUMN_TARGET, parameters,
                                                    xes_constants.DEFAULT_RESOURCE_KEY)
    edge_column = exec_utils.get_param_value(Parameters.EDGE_COLUMN, parameters, xes_constants.DEFAULT_NAME_KEY)

    # Keep only the columns needed by the analysis. Several of the names above
    # usually coincide (e.g. in/out column, source/target node column, sorting
    # and timestamp column), so duplicates are removed to avoid selecting the
    # same column twice. A list is used because indexing a DataFrame with a
    # set is rejected by recent pandas versions; dict.fromkeys keeps a
    # deterministic column order.
    needed_columns = list(dict.fromkeys(
        [timestamp_column, in_column, out_column, node_column_source, node_column_target, edge_column,
         sorting_column]))
    dataframe = dataframe[needed_columns]

    # Link every event with the following events sharing the out/in value.
    parameters_la = {link_analysis.Parameters.OUT_COLUMN: out_column,
                     link_analysis.Parameters.IN_COLUMN: in_column,
                     link_analysis.Parameters.INDEX_COLUMN: index_key,
                     link_analysis.Parameters.SORTING_COLUMN: sorting_column,
                     link_analysis.Parameters.LOOK_FORWARD: True,
                     link_analysis.Parameters.KEEP_FIRST_OCCURRENCE: True,
                     link_analysis.Parameters.PROPAGATE: False}

    merged_df = link_analysis.apply(dataframe, parameters=parameters_la)

    return build_network_analysis_from_link_analysis(merged_df, parameters=parameters)