Source code for pm4py.algo.querying.llm.abstractions.log_to_fea_descr

'''
    PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''

from typing import Optional, Dict, Any, Union
from pm4py.util import exec_utils, constants, pandas_utils
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.transformation.log_to_features import (
    algorithm as log_to_features,
)
from enum import Enum
import numpy as np


class Parameters(Enum):
    """
    Keys accepted in the ``parameters`` dictionary of this module.
    """

    # Whether a descriptive header is included in the returned text.
    INCLUDE_HEADER = "include_header"
    # Maximum length of the returned text.
    MAX_LEN = "max_len"
def _split_on_first(text: str, sep: str) -> tuple:
    """
    Splits ``text`` around the first occurrence of ``sep``.

    Parameters
    ---------------
    text
        String to split
    sep
        Separator

    Returns
    ---------------
    pair
        (part before the separator, part after the separator). When the
        separator does not occur, both elements are the whole ``text``.
    """
    head, found, tail = text.partition(sep)
    if not found:
        return text, text
    return head, tail


def __transform_to_string(stru: str) -> str:
    """
    Translates the technical name of a feature into a human-readable
    description.

    Names are decomposed on the FIRST occurrence of each separator, so
    attribute values and activity names that themselves contain '@', '#'
    or '@@' (e.g. e-mail addresses used as resources) are reported in full
    instead of being truncated.

    Parameters
    ---------------
    stru
        Technical name of the feature (column name of the feature table)

    Returns
    ---------------
    description
        Human-readable description; the name itself is returned unchanged
        when it matches no known pattern (or when a path feature does not
        contain the '##' separator between the two activities).
    """
    # features identified by their exact name
    fixed_names = {
        "@@max_concurrent_activities_general": "Maximum Number of Concurrent Events",
        "@@caseDuration": "Case Duration",
        "@@work_in_progress": "Work in Progress",
    }
    if stru in fixed_names:
        return fixed_names[stru]

    # "event:<attr>[@<value>]" and "trace:<attr>[@<value>]"
    for prefix, level in (("event:", "Event"), ("trace:", "Case")):
        if stru.startswith(prefix):
            body = stru[len(prefix):]
            if "@" in body:
                attr, value = _split_on_first(body, "@")
                return f"Value '{value}' for {level} Attribute '{attr}'"
            return f"Values for {level} Attribute '{body}'"

    # "succession:<attr>@<value1>#<value2>"
    if stru.startswith("succession:"):
        attr, pair = _split_on_first(stru[len("succession:"):], "@")
        val1, val2 = _split_on_first(pair, "#")
        return (
            f"Succession '{val1}' -> '{val2}' "
            f"for the Values of the Attribute '{attr}'"
        )

    # "<prefix><name>": everything after the prefix is the name
    suffix_templates = (
        (
            "@@max_concurrent_activities_like_",
            "Maximum Number of Concurrent '",
            "'",
        ),
        ("firstIndexAct@@", "First Position of the Activity '", "' in the Case"),
        ("lastIndexAct@@", "Last Position of the Activity '", "' in the Case"),
        (
            "startToLastOcc@@",
            "Time from Case Start to Last Occurrence of the Activity '",
            "'",
        ),
        (
            "lastOccToEnd@@",
            "Time from Last Occurrence of the Activity '",
            "' to Case End",
        ),
        (
            "startToFirstOcc@@",
            "Time from Case Start to First Occurrence of the Activity '",
            "'",
        ),
        (
            "firstOccToEnd@@",
            "Time from First Occurrence of the Activity '",
            "' to Case End",
        ),
        ("resource_workload@@", "Resource Workload of '", "'"),
    )
    for prefix, before, after in suffix_templates:
        if stru.startswith(prefix):
            return before + stru[len(prefix):] + after

    # "<prefix><source>##<target>"
    path_templates = (
        ("directPathPerformanceLastOcc@@", "Directly-Follows"),
        ("indirectPathPerformanceLastOcc@@", "Eventually-Follows"),
    )
    for prefix, relation in path_templates:
        if stru.startswith(prefix):
            source, found, target = stru[len(prefix):].partition("##")
            if not found:
                # malformed path feature: keep the raw name rather than fail
                return stru
            return (
                f"{relation} Paths Throughput between '{source}' and "
                f"'{target}' (last occurrence of the path in the case)"
            )

    return stru
def textual_abstraction_from_fea_df(
    fea_df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None
) -> str:
    """
    Builds the textual abstraction of ML features that are already encoded
    in a feature table.

    Every column having at least one strictly positive value is described
    by one line reporting the number of such values and their quantiles
    (0, 0.25, 0.5, 0.75, 1). Columns are listed by decreasing number of
    positive values, then by decreasing ratio between standard deviation
    and average, then by decreasing description.

    Minimum viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_fea_descr

        log = pm4py.read_xes("tests/input_data/receipt.xes", return_legacy_log_object=True)
        fea_df = pm4py.extract_features_dataframe(log)
        text_abstr = log_to_fea_descr.textual_abstraction_from_fea_df(fea_df)
        print(text_abstr)

    Parameters
    ---------------
    fea_df
        Feature table (numeric features; stored as Pandas dataframe)
    parameters
        Optional dictionary of parameters:
        - Parameters.INCLUDE_HEADER => includes a descriptive header in the returned text
        - Parameters.MAX_LEN => maximum length of the provided text (the length is
          checked before each feature line is added, so the last added line may
          exceed it)

    Returns
    ---------------
    stru
        Textual abstraction
    """
    if parameters is None:
        parameters = {}

    include_header = exec_utils.get_param_value(
        Parameters.INCLUDE_HEADER, parameters, True
    )
    max_len = exec_utils.get_param_value(
        Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN
    )

    # one record per column: (count, relative dispersion, description, values)
    records = []
    for column in fea_df.columns:
        values = fea_df[column]
        positive = values[values > 0]
        if len(positive) == 0:
            # nothing to report for this column
            continue

        description = __transform_to_string(column)
        mean = np.average(positive)
        if mean == 0 or len(positive) == 1:
            dispersion = 0
        else:
            dispersion = np.std(positive) / mean
        records.append((len(positive), dispersion, description, positive))

    records.sort(key=lambda rec: (rec[0], rec[1], rec[2]), reverse=True)

    opening = ["\n"]
    if include_header:
        opening.append("Given the following features:\n\n")
    chunks = [" ".join(opening)]
    total_len = len(chunks[0])

    for count, _, description, positive in records:
        if total_len >= max_len:
            break
        quantiles = positive.quantile([0.0, 0.25, 0.5, 0.75, 1.0]).to_dict()
        line = "".join(
            [
                description,
                ": number of non-zero values: ",
                str(count),
                " ; quantiles of the non-zero: ",
                str(quantiles),
                "\n",
            ]
        )
        chunks.append(line)
        total_len += len(line)

    return "".join(chunks)
def apply(
    log: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Builds the textual abstraction of the ML features extracted from an
    event log.

    The log is converted to an event log object, the features are
    extracted from it, and the resulting feature table is described
    through textual_abstraction_from_fea_df. The same parameters
    dictionary is forwarded to all three steps.

    Minimum viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_fea_descr

        log = pm4py.read_xes("tests/input_data/receipt.xes", return_legacy_log_object=True)
        text_abstr = log_to_fea_descr.apply(log)
        print(text_abstr)

    Parameters
    ---------------
    log
        Event log / Pandas dataframe
    parameters
        Parameters that should be provided to the feature extraction, plus:
        - Parameters.INCLUDE_HEADER => includes a descriptive header in the returned text
        - Parameters.MAX_LEN => maximum length of the provided text (if necessary, only the most
          meaningful features are kept)

    Returns
    ---------------
    stru
        Textual abstraction
    """
    parameters = {} if parameters is None else parameters

    event_log = log_converter.apply(
        log,
        variant=log_converter.Variants.TO_EVENT_LOG,
        parameters=parameters,
    )
    extracted = log_to_features.apply(event_log, parameters=parameters)
    data, feature_names = extracted

    feature_table = pandas_utils.instantiate_dataframe(
        data, columns=feature_names
    )
    return textual_abstraction_from_fea_df(feature_table, parameters=parameters)