GUI_AI/Learn.py at main · MeoPBK/GUI_AI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Import necessary libraries
# import numpy as np
from sentence_transformers import SentenceTransformer, util
import simpful as sf

class Fuzz:
    """Sentence-transformer similarity helpers with an optional fuzzy trigger.

    The class offers two independent features:

    * word classification / context filtering based on cosine similarity of
      sentence-transformer embeddings;
    * an optional Simpful fuzzy system (enabled with ``fuzz_trigger=True``)
      that maps a web-similarity and a time-similarity score onto a single
      ``trigger`` confidence.
    """

    # Rules mapping the two similarity inputs onto the ``trigger`` output.
    # Every linguistic variable / term named here is registered by
    # ``_register_fuzzy_variables``.
    # NOTE: the final function can be written as a look-up table.
    _TRIGGER_RULES = (
        "IF ((web_similarity IS low) AND (time_similarity IS low)) THEN (trigger IS low)",
        "IF ((web_similarity IS medium) OR (time_similarity IS medium)) THEN (trigger IS medium)",
        "IF ((web_similarity IS high) OR (time_similarity IS high)) THEN (trigger IS high)",
    )

    def __init__(self, PROTOTYPE_SENTENCES: list = None, fuzz_trigger: bool = False, sentence_categorization: bool = True):
        """Load the embedding model and prepare the requested features.

        Args:
            PROTOTYPE_SENTENCES: Stored as ``self.prototype_sentences``; not
                otherwise used by this class.
            fuzz_trigger: When true, build the fuzzy system (``self.FS``) with
                its rules and linguistic variables. When false ``self.FS`` is
                ``None``.
            sentence_categorization: When true, the category embeddings are
                computed eagerly here; otherwise they are computed on the
                first call to ``transformer_classify_word``.
        """
        self.prototype_sentences = PROTOTYPE_SENTENCES

        # SEMANTIC SIMILARITIES SETUP
        # Pre-trained Sentence Transformer model that turns text into
        # embeddings. Alternatives tried: all-MiniLM-L6-v2,
        # msmarco-distilbert-base-v4, paraphrase-MiniLM-L6-v2 (22M),
        # all-mpnet-base-v2 / paraphrase-mpnet-base-v2 (110M).
        self.model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

        # These attributes are defined in every mode so that neither public
        # method fails with AttributeError depending on the constructor flags.
        self.categories = ["web", "time-related", "financial", "scientific", "mathematics", "article", "greetings", "other"]
        self.category_embeddings = None
        # top_k is kept for callers that read it; this class does not use it.
        self.top_k = 5
        self.context_embeddings = []
        if sentence_categorization:
            self.category_embeddings = self.model.encode(self.categories, normalize_embeddings=True)

        # FUZZY LOGIC SYSTEM SETUP - for triggering confidence
        self.fuzz_trigger = fuzz_trigger
        self.FS = None
        if self.fuzz_trigger:
            self.FS = sf.FuzzySystem()
            self.FS.add_rules(list(self._TRIGGER_RULES))
            self._register_fuzzy_variables()

    ## TRANSFORMERS ##
    def transformer_classify_word(self, word: str) -> tuple:
        """Classify ``word`` against ``self.categories`` by cosine similarity.

        Returns:
            A tuple ``(web_confidence, time_related_confidence, best_category,
            best_confidence)`` where the confidences are cosine similarities
            and ``best_category`` is the highest-scoring category name.
        """
        if self.category_embeddings is None:
            # Instance was built with sentence_categorization=False: encode
            # the categories on first use.
            self.category_embeddings = self.model.encode(self.categories, normalize_embeddings=True)

        word_embedding = self.model.encode([word], normalize_embeddings=True)
        similarity_scores = util.cos_sim(word_embedding, self.category_embeddings)[0]

        best_category_index = similarity_scores.argmax().item()
        best_category = self.categories[best_category_index]
        best_confidence = similarity_scores[best_category_index].item()
        # Positions 0 and 1 of self.categories are "web" and "time-related".
        web_confidence = similarity_scores[0].item()
        time_related_confidence = similarity_scores[1].item()
        print("wc: " + str(web_confidence) + ", bc: " + str(best_confidence) + " " + str(best_category))
        return web_confidence, time_related_confidence, best_category, best_confidence

    def transformer_context_filter(self, query: str, context: list, buffer_context_transformer: int, context_threshold: float) -> str:
        """Keep only the query words that are similar to some stored context.

        Args:
            query: Text to filter; it is split on whitespace.
            context: Context strings to compare each word against.
            buffer_context_transformer: If ``context`` has at most this many
                entries the query is returned unchanged and nothing is encoded.
            context_threshold: Minimum cosine similarity between a word and at
                least one context entry for the word to be kept.

        Returns:
            The unchanged ``query`` when the context is too short, otherwise
            the kept words (first-occurrence order, no duplicates) joined by
            single spaces; this may be an empty string.
        """
        if len(context) <= buffer_context_transformer:
            return query

        # Stored on the instance, as before, so callers can inspect it.
        self.context_embeddings = self.model.encode(context, convert_to_tensor=True)

        relevant_words = []
        seen_words = set()
        for word in query.split():  # Simple split, can use NLP tokenizer if needed
            if word in seen_words:
                # A repeated word yields the same scores; skip re-encoding it.
                continue
            seen_words.add(word)

            word_embedding = self.model.encode(word, convert_to_tensor=True)
            similarity_scores = util.cos_sim(word_embedding, self.context_embeddings)[0].cpu().numpy()
            # Only whether *any* context reaches the threshold matters for
            # the result, so no ranking of the contexts is needed.
            if any(score >= context_threshold for score in similarity_scores):
                relevant_words.append(word)

        filtered_query = " ".join(relevant_words)
        print(filtered_query)
        return filtered_query

    ## FUZZY LOGIC ##
    @staticmethod
    def _three_level_variable(concept: str):
        """Build a fresh low/medium/high linguistic variable on [0, 1]."""
        ## NOTE: open question: should these be gaussian, e.g. Gaussian_MF(mu=1, sigma=0.5)?
        levels = [
            sf.FuzzySet(function=sf.Triangular_MF(a=0.0, b=0.0, c=0.5), term="low"),
            sf.FuzzySet(function=sf.Triangular_MF(a=0.3, b=0.5, c=0.7), term="medium"),
            sf.FuzzySet(function=sf.Triangular_MF(a=0.5, b=1.0, c=1.0), term="high"),
        ]
        return sf.LinguisticVariable(levels, concept=concept, universe_of_discourse=[0, 1])

    def _register_fuzzy_variables(self):
        """Register every variable named in ``_TRIGGER_RULES`` on ``self.FS``."""
        self.FS.add_linguistic_variable("web_similarity", self._three_level_variable("Web similarity"))
        self.FS.add_linguistic_variable("time_similarity", self._three_level_variable("Time similarity"))
        self.FS.add_linguistic_variable("trigger", self._three_level_variable("Trigger confidence"))

    def fuzziSettings(self, similarity_score: float) -> "str | None":
        """(Re)register the fuzzy input and output variables.

        The variables are ``web_similarity``, ``time_similarity`` and
        ``trigger``, each with the terms low / medium / high, matching the
        names used by the rules.

        Args:
            similarity_score: Unused; kept so existing calls keep working.

        Returns:
            ``None`` on success, or the string ``"ERROR"`` when the fuzzy
            system is disabled or the registration fails.
        """
        if getattr(self, "FS", None) is None:
            return "ERROR"
        try:
            self._register_fuzzy_variables()
        except Exception as exc:
            # Keep the historical "ERROR" return value, but report the cause.
            print("fuzziSettings failed: " + repr(exc))
            return "ERROR"
        return None

    ## IDEA:
    # Computes the cosine similarity between a user's input and the prototype centroid.
    # def compute_similarity(self, user_input: str)->float:
    #     # Compute the embedding for the user input.
    #     input_embedding = self.model.encode(user_input, convert_to_tensor=True)
    #     # Calculate cosine similarity between the input embedding and the prototype centroid.
    #     similarity = util.cos_sim(input_embedding, self.prototype())
    #     # Convert the tensor output to a Python float.
    #     similarity_score = similarity.item()
    #     return similarity_score     # (float): A value between 0 and 1 representing the semantic similarity.

    def defuzzify_output(self, similarity_score: float, time_similarity_score: float = None) -> float:
        """Run fuzzy inference and return the ``trigger`` confidence.

        Args:
            similarity_score: Value assigned to the ``web_similarity`` input.
            time_similarity_score: Value assigned to the ``time_similarity``
                input; defaults to ``similarity_score`` when omitted.

        Returns:
            The ``trigger`` entry of the inference result.

        Raises:
            RuntimeError: If the instance was built without
                ``fuzz_trigger=True`` (there is no fuzzy system to query).
        """
        fuzzy_system = getattr(self, "FS", None)
        if fuzzy_system is None:
            raise RuntimeError("Fuzzy system is disabled; create Fuzz with fuzz_trigger=True")
        if time_similarity_score is None:
            time_similarity_score = similarity_score

        fuzzy_system.set_variable("web_similarity", similarity_score)
        fuzzy_system.set_variable("time_similarity", time_similarity_score)
        # Run the inference once and reuse the result for logging and return.
        result = fuzzy_system.inference()
        print(result)
        return result['trigger']