Source code for pm4py.algo.querying.llm.abstractions.log_to_fea_descr
'''
PM4Py – A Process Mining Library for Python
Copyright (C) 2024 Process Intelligence Solutions UG (haftungsbeschränkt)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see this software project's root or
visit <https://www.gnu.org/licenses/>.

Website: https://processintelligence.solutions
Contact: info@processintelligence.solutions
'''
from typing import Optional, Dict, Any, Union
from pm4py.util import exec_utils, constants, pandas_utils
from pm4py.objects.log.obj import EventLog, EventStream
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.transformation.log_to_features import (
    algorithm as log_to_features,
)
from enum import Enum
import numpy as np
class Parameters(Enum):
    INCLUDE_HEADER = "include_header"
    MAX_LEN = "max_len"
def __transform_to_string(stru: str) -> str:
    """
    Translates an internally encoded feature name into a human-readable description.
    """
    if stru == "@@max_concurrent_activities_general":
        return "Maximum Number of Concurrent Events"
    elif stru.startswith("@@max_concurrent_activities_like_"):
        return (
            "Maximum Number of Concurrent '"
            + stru.split("@@max_concurrent_activities_like_")[-1]
            + "'"
        )
    elif stru.startswith("event:"):
        stru = stru.split("event:")[-1]
        if "@" in stru:
            attr = stru.split("@")[0]
            value = stru.split("@")[-1]
            return "Value '" + value + "' for Event Attribute '" + attr + "'"
        else:
            return "Values for Event Attribute '" + stru + "'"
    elif stru.startswith("trace:"):
        stru = stru.split("trace:")[-1]
        if "@" in stru:
            attr = stru.split("@")[0]
            value = stru.split("@")[-1]
            return "Value '" + value + "' for Case Attribute '" + attr + "'"
        else:
            return "Values for Case Attribute '" + stru + "'"
    elif stru.startswith("succession:"):
        stru = stru.split("succession:")[-1]
        attr = stru.split("@")[0]
        stru = stru.split("@")[-1]
        val1 = stru.split("#")[0]
        val2 = stru.split("#")[-1]
        return (
            "Succession '"
            + val1
            + "' -> '"
            + val2
            + "' for the Values of the Attribute '"
            + attr
            + "'"
        )
    elif stru == "@@caseDuration":
        return "Case Duration"
    elif stru.startswith("firstIndexAct@@"):
        return (
            "First Position of the Activity '"
            + stru.split("@@")[-1]
            + "' in the Case"
        )
    elif stru.startswith("lastIndexAct@@"):
        return (
            "Last Position of the Activity '"
            + stru.split("@@")[-1]
            + "' in the Case"
        )
    elif stru.startswith("startToLastOcc@@"):
        return (
            "Time from Case Start to Last Occurrence of the Activity '"
            + stru.split("@@")[-1]
            + "'"
        )
    elif stru.startswith("lastOccToEnd@@"):
        return (
            "Time from Last Occurrence of the Activity '"
            + stru.split("@@")[-1]
            + "' to Case End"
        )
    elif stru.startswith("startToFirstOcc@@"):
        return (
            "Time from Case Start to First Occurrence of the Activity '"
            + stru.split("@@")[-1]
            + "'"
        )
    elif stru.startswith("firstOccToEnd@@"):
        return (
            "Time from First Occurrence of the Activity '"
            + stru.split("@@")[-1]
            + "' to Case End"
        )
    elif stru.startswith("directPathPerformanceLastOcc@@"):
        stru = stru.split("@@")[-1].split("##")
        return (
            "Directly-Follows Paths Throughput between '"
            + stru[0]
            + "' and '"
            + stru[1]
            + "' (last occurrence of the path in the case)"
        )
    elif stru.startswith("indirectPathPerformanceLastOcc@@"):
        stru = stru.split("@@")[-1].split("##")
        return (
            "Eventually-Follows Paths Throughput between '"
            + stru[0]
            + "' and '"
            + stru[1]
            + "' (last occurrence of the path in the case)"
        )
    elif stru.startswith("resource_workload@@"):
        return "Resource Workload of '" + stru.split("@@")[-1] + "'"
    elif stru == "@@work_in_progress":
        return "Work in Progress"
    return stru
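
# Illustration (added comments, not part of the original module): a few hypothetical encoded
# feature names and the readable descriptions that __transform_to_string produces for them.
#
#   "event:concept:name@A"   -> "Value 'A' for Event Attribute 'concept:name'"
#   "trace:channel"          -> "Values for Case Attribute 'channel'"
#   "@@caseDuration"         -> "Case Duration"
#   "startToFirstOcc@@A"     -> "Time from Case Start to First Occurrence of the Activity 'A'"
#   "resource_workload@@Sue" -> "Resource Workload of 'Sue'"
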
def textual_abstraction_from_fea_df(
    fea_df: pd.DataFrame, parameters: Optional[Dict[Any, Any]] = None
) -> str:
    """
    Returns the textual abstraction of ML features already encoded in a feature table.

    Minimum viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_fea_descr

        log = pm4py.read_xes("tests/input_data/receipt.xes", return_legacy_log_object=True)
        fea_df = pm4py.extract_features_dataframe(log)
        text_abstr = log_to_fea_descr.textual_abstraction_from_fea_df(fea_df)
        print(text_abstr)

    Parameters
    ---------------
    fea_df
        Feature table (numeric features, stored as a Pandas dataframe)
    parameters
        Parameters that should be provided to the feature extraction, plus:
        - Parameters.INCLUDE_HEADER => includes a descriptive header in the returned text
        - Parameters.MAX_LEN => maximum length of the returned text (if necessary, only the most meaningful features are kept)

    Returns
    ---------------
    stru
        Textual abstraction
    """
    if parameters is None:
        parameters = {}

    include_header = exec_utils.get_param_value(
        Parameters.INCLUDE_HEADER, parameters, True
    )
    max_len = exec_utils.get_param_value(
        Parameters.MAX_LEN, parameters, constants.OPENAI_MAX_LEN
    )

    # describe each feature column, keeping only its non-zero values
    cols = []
    for c in fea_df.columns:
        ser = fea_df[c]
        ser1 = ser[ser > 0]
        if len(ser1) > 0:
            desc = __transform_to_string(c)
            avg = np.average(ser1)
            stdavg = 0 if avg == 0 or len(ser1) == 1 else np.std(ser1) / avg
            cols.append([desc, len(ser1), stdavg, ser1])

    # rank the features: most frequently non-zero first, then highest relative variability
    cols = sorted(cols, key=lambda x: (x[1], x[2], x[0]), reverse=True)

    ret = ["\n"]
    if include_header:
        ret.append("Given the following features:\n\n")
    ret = " ".join(ret)

    # append one line per feature until the length budget is exhausted
    i = 0
    while i < len(cols):
        if len(ret) >= max_len:
            break
        fea_name = cols[i][0]
        fea_col = cols[i][3]
        stru = (
            fea_name
            + ": number of non-zero values: "
            + str(cols[i][1])
            + " ; quantiles of the non-zero: "
            + str(fea_col.quantile([0.0, 0.25, 0.5, 0.75, 1.0]).to_dict())
            + "\n"
        )
        ret = ret + stru
        i = i + 1

    return ret
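
# Usage sketch (illustrative, not part of the original module): capping the abstraction so it
# fits a prompt budget. "features.csv" is a hypothetical file holding a numeric feature table
# whose columns follow the encodings handled by __transform_to_string.
#
#   import pandas as pd
#
#   fea_df = pd.read_csv("features.csv")
#   text = textual_abstraction_from_fea_df(
#       fea_df,
#       parameters={Parameters.MAX_LEN: 5000, Parameters.INCLUDE_HEADER: False},
#   )
#   print(text)
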
def apply(
    log: Union[EventLog, EventStream, pd.DataFrame],
    parameters: Optional[Dict[Any, Any]] = None,
) -> str:
    """
    Returns the textual abstraction of ML features extracted from a traditional event log object.

    Minimum viable example:

        import pm4py
        from pm4py.algo.querying.llm.abstractions import log_to_fea_descr

        log = pm4py.read_xes("tests/input_data/receipt.xes", return_legacy_log_object=True)
        text_abstr = log_to_fea_descr.apply(log)
        print(text_abstr)

    Parameters
    ---------------
    log
        Event log / Pandas dataframe
    parameters
        Parameters that should be provided to the feature extraction, plus:
        - Parameters.INCLUDE_HEADER => includes a descriptive header in the returned text
        - Parameters.MAX_LEN => maximum length of the returned text (if necessary, only the most meaningful features are kept)

    Returns
    ---------------
    stru
        Textual abstraction
    """
    if parameters is None:
        parameters = {}

    # convert the provided object to an EventLog, extract the numeric features,
    # and describe them textually
    log = log_converter.apply(
        log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters
    )
    data, feature_names = log_to_features.apply(log, parameters=parameters)
    fea_df = pandas_utils.instantiate_dataframe(data, columns=feature_names)

    return textual_abstraction_from_fea_df(fea_df, parameters=parameters)
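
# Usage sketch (assumption, not part of the original module): the guard below shows how the
# abstraction could be produced from an event log and limited in size before being embedded
# into an LLM prompt. The XES path is the one used in the docstring examples above; adjust it
# to an event log available on your machine.
if __name__ == "__main__":
    import pm4py

    example_log = pm4py.read_xes(
        "tests/input_data/receipt.xes", return_legacy_log_object=True
    )
    abstraction = apply(example_log, parameters={Parameters.MAX_LEN: 10000})
    print(abstraction)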