# Source code for pm4py.algo.transformation.log_to_features.variants.event_based
'''
This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).
PM4Py is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PM4Py is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
'''
from collections import Counter
from enum import Enum
from typing import Optional, List, Dict, Any, Tuple, Union
import numpy as np
from pm4py.objects.log.obj import EventLog
from pm4py.util import exec_utils
from pm4py.objects.conversion.log import converter
class Parameters(Enum):
    """
    Keys accepted in the ``parameters`` dictionary of this variant.
    """
    # String event attributes to consider in the features extraction
    # (when not provided, every string attribute is considered).
    STR_EVENT_ATTRIBUTES = "str_ev_attr"
    # Numeric event attributes to consider in the features extraction
    # (when not provided, every numeric attribute is considered).
    NUM_EVENT_ATTRIBUTES = "num_ev_attr"
    # Features to consider, in the given order; when provided, the feature
    # names are not extracted from the log.
    FEATURE_NAMES = "feature_names"
    # Minimum number of distinct values a string attribute must have to be
    # included as feature(s).
    MIN_NUM_DIFF_STR_VALUES = "min_num_diff_str_values"
    # Maximum number of distinct values a string attribute may have to be
    # included as feature(s).
    MAX_NUM_DIFF_STR_VALUES = "max_num_diff_str_values"
def extract_all_ev_features_names_from_log(log: EventLog, str_ev_attr: List[str], num_ev_attr: List[str],
                                           parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[str]:
    """
    Extracts the feature names from an event log.

    A string attribute contributes one feature per distinct value, named
    "event:<attribute>@<value>". A numeric attribute contributes a single
    feature named "event:<attribute>", and only when it holds a numeric
    value in every event of the log.

    Parameters
    ---------------
    log
        Event log
    str_ev_attr
        (if provided) list of string event attributes to consider in extracting the feature names
    num_ev_attr
        (if provided) list of numeric (int or float) event attributes to consider in extracting the feature names
    parameters
        Parameters, including:
        - MIN_NUM_DIFF_STR_VALUES => minimum number of distinct values to include an attribute as feature(s)
        - MAX_NUM_DIFF_STR_VALUES => maximum number of distinct values to include an attribute as feature(s)

    Returns
    ----------------
    feature_names
        Sorted list of feature names
    """
    if parameters is None:
        parameters = {}

    min_num_diff_str_values = exec_utils.get_param_value(Parameters.MIN_NUM_DIFF_STR_VALUES, parameters, 2)
    max_num_diff_str_values = exec_utils.get_param_value(Parameters.MAX_NUM_DIFF_STR_VALUES, parameters, 500)

    # Build the allow-lists once as sets, so the membership test performed
    # for every attribute of every event does not rescan a list.
    allowed_str = None if str_ev_attr is None else set(str_ev_attr)
    allowed_num = None if num_ev_attr is None else set(num_ev_attr)

    # attribute name -> set of "event:<attribute>@<value>" feature names
    str_features = {}
    # "event:<attribute>" -> number of events holding a numeric value for it
    num_features = Counter()
    count_events = 0

    for trace in log:
        for event in trace:
            count_events += 1
            for attr_name in event:
                attr_value = event[attr_name]
                if isinstance(attr_value, str) and (allowed_str is None or attr_name in allowed_str):
                    str_features.setdefault(attr_name, set()).add("event:" + attr_name + "@" + attr_value)
                elif isinstance(attr_value, (int, float)):
                    if allowed_num is None or attr_name in allowed_num:
                        num_features["event:" + attr_name] += 1

    # A numeric attribute is kept only when it was seen in every event.
    num_feature_names = [name for name, occurrences in num_features.items() if occurrences == count_events]

    # A string attribute is kept only when its number of distinct values
    # lies within the configured bounds.
    str_feature_names = [name for values in str_features.values()
                         if min_num_diff_str_values <= len(values) <= max_num_diff_str_values
                         for name in values]

    return sorted(str_feature_names + num_feature_names)
def extract_features(log: EventLog, feature_names: List[str], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Tuple[
    Any, List[str]]:
    """
    Extracts the matrix of the features from an event log

    Parameters
    ---------------
    log
        Event log
    feature_names
        Features to consider (in the given order)
    parameters
        Not used by this function

    Returns
    -------------
    data
        Data to provide for decision tree learning: a float32 array of shape
        (number of traces, length of the longest trace, number of features).
        Positions past the end of a shorter trace are left at zero.
    feature_names
        Names of the features, in order
    """
    # default=0 keeps an empty log from raising ValueError in max().
    max_trace_length = max((len(trace) for trace in log), default=0)

    # Resolve each feature name to its column once. Should a name be
    # repeated, the first position wins, exactly as list.index() would.
    feature_index = {}
    for position, name in enumerate(feature_names):
        feature_index.setdefault(name, position)

    data = np.zeros((len(log), max_trace_length, len(feature_names)), dtype=np.float32)

    for i1, trace in enumerate(log):
        for i2, event in enumerate(trace):
            num_features = {}
            for attr_name in event:
                attr_value = event[attr_name]
                if isinstance(attr_value, str):
                    # String attributes are one-hot encoded.
                    column = feature_index.get("event:" + attr_name + "@" + attr_value)
                    if column is not None:
                        data[i1, i2, column] = 1
                elif isinstance(attr_value, (int, float)):
                    num_features["event:" + attr_name] = attr_value
            # Numeric values are written after the string ones, so that a
            # numeric feature takes precedence if two names ever coincide.
            for name, value in num_features.items():
                column = feature_index.get(name)
                if column is not None:
                    data[i1, i2, column] = value

    return data, feature_names
def apply(log: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Tuple[Any, List[str]]:
    """
    Extracts all the features for the traces of an event log (each trace becomes a vector of vectors, where each
    event has its own vector)

    Parameters
    -----------------
    log
        Event log
    parameters
        Parameters of the algorithm, including:
        - STR_EVENT_ATTRIBUTES => string event attributes to consider in the features extraction
        - NUM_EVENT_ATTRIBUTES => numeric event attributes to consider in the features extraction
        - FEATURE_NAMES => features to consider (in the given order)

    Returns
    -------------
    data
        Data to provide for decision tree learning
    feature_names
        Names of the features, in order
    """
    if parameters is None:
        parameters = {}

    str_ev_attr = exec_utils.get_param_value(Parameters.STR_EVENT_ATTRIBUTES, parameters, None)
    num_ev_attr = exec_utils.get_param_value(Parameters.NUM_EVENT_ATTRIBUTES, parameters, None)
    feature_names = exec_utils.get_param_value(Parameters.FEATURE_NAMES, parameters, None)

    # The extraction iterates over traces and their events, so the input
    # is first converted to an event log.
    log = converter.apply(log, variant=converter.Variants.TO_EVENT_LOG, parameters=parameters)

    # Feature names are derived from the log only when the caller did not
    # provide them.
    if feature_names is None:
        feature_names = extract_all_ev_features_names_from_log(log, str_ev_attr, num_ev_attr, parameters=parameters)

    return extract_features(log, feature_names, parameters=parameters)