VideoRepair/utils_client.py at main · daeunni/VideoRepair · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
import ast
import configparser
import re
import time

from openai import AzureOpenAI

# Read the connection settings from the local 'config.ini' file.
config = configparser.ConfigParser()
config.read('config.ini')

# Set up your own OpenAI API: every client argument is looked up under the
# same name in the [openai] section of the config file.
client = AzureOpenAI(
    **{
        option: config.get("openai", option)
        for option in ("azure_endpoint", "api_key", "api_version")
    }
)


def asking_gpt4o(system_prompt, task_prompt, gpt4_input_image):
    """Ask the "gpt-4o" model one text + image question and return its reply.

    Args:
        system_prompt: Content of the system message.
        task_prompt: Text part of the user message.
        gpt4_input_image: Value interpolated into a
            ``data:image/jpeg;base64,...`` URL (presumably a base64-encoded
            JPEG string -- confirm against callers).

    Returns:
        The ``content`` of the first choice of the completion response.
    """
    text_part = {"type": "text", "text": task_prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{gpt4_input_image}"},
    }
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [text_part, image_part]},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o",           # "gpt-4o-new"
        messages=conversation,
        max_tokens=100,
    )
    return completion.choices[0].message.content


def filter_DSG_answer_w_dependency(dsg_answers, qid2dependency):
    """Zero out answers whose parent question was answered "No".

    Question ids are the 1-based positions of ``dsg_answers`` rendered as
    strings ("1", "2", ...). Dependencies are processed in the iteration
    order of ``qid2dependency``, and a score zeroed for one question is seen
    by the questions processed after it.

    Args:
        dsg_answers: List of answer dicts; each must hold its score under
            ``'A'``. The list entries are modified in place.
        qid2dependency: Mapping from question id to an iterable of parent
            ids. A parent id of ``0`` means "no parent" and is ignored.

    Returns:
        A tuple ``(qid2scores, qid2validity, dsg_answers)``:
        ``qid2scores`` maps question id to its (possibly zeroed) score,
        ``qid2validity`` maps each id of ``qid2dependency`` to ``False`` when
        one of its parents scored 0 and ``True`` otherwise, and
        ``dsg_answers`` is the same list object that was passed in.

    Raises:
        KeyError: If a non-zero parent id has no recorded score.
    """
    # e.g., {'1': 0.0, '2': 0.0, '3': 1.0, '4': 1.0}
    qid2scores = {
        str(position): qa['A']
        for position, qa in enumerate(dsg_answers, start=1)
    }
    qid2validity = {}

    # consider dependency -> modify dsg_answers
    for qid, parent_ids in qid2dependency.items():
        any_parent_answered_no = any(
            parent_id != 0 and qid2scores[str(parent_id)] == 0
            for parent_id in parent_ids
        )

        if not any_parent_answered_no:
            qid2validity[qid] = True
            continue

        qid2scores[qid] = 0.0
        # Record the invalidation before touching dsg_answers, so a missing
        # answer entry can no longer leave this id out of qid2validity.
        qid2validity[qid] = False
        try:
            dsg_answers[int(qid) - 1]['A'] = 0.0
        except (IndexError, KeyError, TypeError, ValueError):
            # No answer entry matches this id; only the score maps are updated.
            pass

    return qid2scores, qid2validity, dsg_answers


def _parse_answer_dict(raw_answer):
    """Parse a model reply that should consist of one Python dict literal.

    Newlines and Markdown code fences are removed first. The text is then
    read with ``ast.literal_eval`` (never ``eval``: the reply is untrusted
    model output), once as-is and, if that fails, once more with every
    backslash removed, which is what the previous cleanup always did.

    Returns:
        The parsed dict, or ``None`` when the reply is not a string or does
        not parse to a dict.
    """
    if not isinstance(raw_answer, str):
        return None

    cleaned = raw_answer.replace('\n', '')
    # Longest markers first, so no stray "python"/"json" tag is left behind.
    for fence in ('```python', '```json', '```'):
        cleaned = cleaned.replace(fence, '')
    cleaned = cleaned.strip()

    for candidate in (cleaned, cleaned.replace('\\', '')):
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def ask_gpt4o_DSG_and_grounding_wo_vprompt(gpt4_input_image, qid2question, init_prompt):
    """Ask GPT-4o every question in ``qid2question`` about one image.

    Args:
        gpt4_input_image: Image payload forwarded to ``asking_gpt4o``.
        qid2question: Mapping whose keys are the strings "1" .. "N" and whose
            values are the question texts.
        init_prompt: Unused; kept so existing callers keep working.

    Returns:
        A list with one parsed answer dict per question, in question order.
        ``None`` is returned as soon as one question fails more than three
        API attempts or its reply cannot be parsed into a dict.
    """
    system_prompt = 'You are an expert at answering questions about the content of a given image.'

    dsg_answers_with_area = []
    for i in range(len(qid2question)):
        cur_question = qid2question[str(i+1)]

        task_prompt = f'1. Given the question: "{cur_question}", provide a brief reasoning (up to two sentences) to determine an accurate answer. \
                        2. Respond using binary values: 1.0 for Yes and 0.0 for No. If the answer is uncertain due to image distortion or other issues, respond with 0.0 (No). \
                        Return the result as a dictionary in the following format (not in JSON format): \
                        {{"Q": "<question>", "reasoning": "<brief reasoning>", "A": <binary answer>}} \
                        (e.g., {{"Q": "Is there one robot?", "reasoning": "There are two visible robots in the image. To guarantee a Yes answer, one robot should be removed.", "A": 0.0}}) \
                        Provide only the dictionary as the output, without any additional text or explanations.'

        success = False
        error_count = 0
        while not success:
            try:
                # Same request as before (gpt-4o, max_tokens=100), now sent
                # through the shared helper instead of a duplicated call.
                answer = asking_gpt4o(system_prompt, task_prompt, gpt4_input_image)
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C still works.
                print('ERROR..')
                error_count += 1
                if error_count > 3:
                    return None
                time.sleep(9)
            else:
                print(answer)
                print('*' * 5)
                success = True

        answer_dict = _parse_answer_dict(answer)
        if answer_dict is None:
            return None
        dsg_answers_with_area.append(answer_dict)
    return dsg_answers_with_area


def video_evaluation(qid2question, first_frame_img_gpt, Q_type, key_objects_in_questions, qid2dependency=None):
    """Answer every question about the first frame and optionally apply dependencies.

    Args:
        qid2question: Mapping whose keys are the strings "1" .. "N" and whose
            values are the question texts.
        first_frame_img_gpt: Image payload forwarded to ``asking_gpt4o``.
        Q_type: Per-question type, indexed by question position. The value
            ``'other'`` selects the counting prompt; anything else selects
            the plain yes/no prompt.
        key_objects_in_questions: Per-question key object(s), indexed by
            question position; only interpolated into the counting prompt.
        qid2dependency: Optional mapping from question id to parent ids,
            passed to ``filter_DSG_answer_w_dependency``. The previous code
            read an undefined name here, so filtering never ran. When
            omitted, a module-level ``qid2dependency`` is used if one exists.

    Returns:
        The list of parsed answer dicts. Replies that cannot be parsed are
        skipped, so the list may be shorter than ``qid2question``.
    """
    system_prompt = 'You are an expert at answering questions about the content of a given image.'

    dsg_answers = []
    for i in range(len(qid2question)):
        cur_question = qid2question[str(i+1)]
        cur_question_type = Q_type[i]
        key_objects = key_objects_in_questions[i]

        # Divide count prompt & non-count prompt
        count_prompt = f'''
                        1. Given the question: "{cur_question}", provide a brief reasoning (up to two sentences) to determine the accurate answer.
                        2. Respond to the question using binary values: 1.0 for "Yes" and 0.0 for "No". If the answer is uncertain or unnatural due to image distortion or other issues, respond with 0.0 ("No").
                        3. Return the number of "{key_objects}" (as an integer) mentioned in the initial prompt "{cur_question}".
                        4. Return the number of "{key_objects}" (as an integer) in the provided image.

                        Return the result as a dictionary in the following format (not in JSON format):
                        {{
                            "Q": "<question>",
                            "A": <binary answer>,
                            "reasoning": "<brief reasoning>",
                            "obj_in_prompt": <number of key object mentioned in the initial prompt>,
                            "obj_in_img": <number of key object in the image>,
                        }}

                        Example:
                        {{
                            "Q": "Is there one robot?",
                            "A": 0.0,
                            "reasoning": "There are two visible robots in the image.",
                            "obj_in_prompt": 1,
                            "obj_in_img": 2,
                        }}

                        Please provide only the dictionary as the output without any additional text or explanation.
                        '''

        non_count_prompt = f'''
                            Respond to "{cur_question}" using binary values: 1.0 for Yes and 0.0 for No. If the answer is uncertain due to image distortion or other issues, respond with 0.0 (No). \
                            Return the result as a dictionary in the following format (not in JSON format): \
                            {{"Q": "<question>", "A": <binary answer>}} \
                            (e.g., {{"Q": "Is there one robot?", "A": 0.0}}) \
                            Provide only the dictionary as the output, without any additional text or explanations.
                            '''

        # NOTE(review): the counting prompt is selected for type 'other' and
        # the plain prompt for every other type -- confirm this is not inverted.
        task_prompt = count_prompt if cur_question_type == 'other' else non_count_prompt

        # Retries without limit, as before; only `Exception` is caught so
        # that Ctrl-C can still interrupt the loop.
        success = False
        while not success:
            try:
                answer = asking_gpt4o(system_prompt, task_prompt, first_frame_img_gpt)
            except Exception:
                print('ERROR..')
                time.sleep(9)
            else:
                print(answer) ; print('*' * 5)
                success = True

        answer_dict = _parse_answer_dict(answer)
        if answer_dict is None:
            continue

        dsg_answers.append(answer_dict)

    if qid2dependency is None:
        # Backward compatibility: the original body looked this name up as a
        # module-level global.
        qid2dependency = globals().get('qid2dependency')
    if not qid2dependency:
        return dsg_answers

    if len(dsg_answers) != len(qid2question):
        # A skipped reply shifts list positions, so position-based question
        # ids would hit the wrong answers -> not consider dependency.
        return dsg_answers

    try:
        qid2scores, qid2validity, dsg_answers = filter_DSG_answer_w_dependency(dsg_answers, qid2dependency)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError) as err:
        # error -> not consider dependency
        print('Dependency filtering skipped: ', err)
        return dsg_answers

    print('Updated DSG score: ', qid2scores)
    print('Updated logs: ', qid2validity)
    return dsg_answers


def keep_object_selection(key_objects_from_Q, dsg_answers, first_frame_img_gpt):
    """Ask GPT-4o which key object of the generated image should be kept.

    Args:
        key_objects_from_Q: Iterable of object names (strings).
        dsg_answers: List of answer dicts; each must hold its question text
            under ``'Q'``.
        first_frame_img_gpt: Image payload interpolated into a
            ``data:image/jpeg;base64,...`` URL.

    Returns:
        A tuple ``(preserve_object, object_wise_dict)``. ``preserve_object``
        is the model's reply text, or ``None`` after more than three failed
        attempts. ``object_wise_dict`` maps each object name to the answer
        dicts whose question mentions it.
    """
    # Object-wise question collection
    object_wise_dict = {}
    for obj in key_objects_from_Q:
        # A question belongs to an object when it contains the object name
        # or any single word of it as a whole word (case-insensitive). Words
        # are escaped so regex metacharacters in a name cannot break or
        # distort the pattern.
        alternatives = [r'\b' + re.escape(word) + r'\b' for word in obj.split()]
        word_pattern = re.compile('|'.join(alternatives), re.IGNORECASE) if alternatives else None
        object_wise_dict[obj] = [
            cur_qa for cur_qa in dsg_answers
            if obj in cur_qa['Q']
            or (word_pattern is not None and word_pattern.search(cur_qa['Q']))
        ]

    # The second segment now carries the `f` prefix; without it the literal
    # text "{key_objects_from_Q}" was sent instead of the object list.
    task_prompt_key = (
        f"Given the generated image and the list of question-answer pairs for each object, represented as {object_wise_dict}, "
        f"choose the most accurately or visibly generated object from the list {key_objects_from_Q}. "
        "Prioritize selecting objects with a high number of answers rated 1.0 for each question."
        "Select the object that is both large and clearly visible, prioritizing prominent objects (such as animals, humans, or specific items) over background elements (like ocean or city). "
        "Return only the name of the best object to keep from the list, without additional explanation (e.g., 'dog')."
    )

    preserve_object = None
    error_count = 0
    while True:
        try:
            local_response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": task_prompt_key},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{first_frame_img_gpt}"}},
                        ],
                    }
                ],
                max_tokens=100,
            )
            preserve_object = local_response.choices[0].message.content
            break
        except Exception:
            print('ERROR..')
            error_count += 1
            if error_count > 3:          # stop condition
                preserve_object = None
                break
            # Wait only when another attempt will actually follow.
            time.sleep(9)

    return preserve_object, object_wise_dict


def paraphrasing_prompt(origin_prompt):
    """Ask the "gpt-4-0125" model for a paraphrase of ``origin_prompt``.

    Args:
        origin_prompt: The prompt text to paraphrase.

    Returns:
        The list of strings found between ``<PROMPT>`` and ``</PROMPT>`` in
        the model reply (possibly empty), or ``None`` when more than three
        attempts failed.
    """
    task_prompt_key = (
        f"Given the prompt: {origin_prompt}, generate 1 paraphrases of the initial prompt which keep the semantic meaning."
        "Respond with each new prompt in between <PROMPT> and </PROMPT>, eg: <PROMPT>paraphrase </PROMPT>. Answer using a single phrase. Do NOT generate any explanation, write only answer."
    )

    new_prompt = None
    error_count = 0
    while True:
        try:
            response = client.chat.completions.create(
                model="gpt-4-0125",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": task_prompt_key},
                        ],
                    }
                ],
                max_tokens=100,
            )
            reply_text = response.choices[0].message.content
            new_prompt = re.findall(r'<PROMPT>(.*?)</PROMPT>', reply_text)
            break
        except Exception:
            print('ERROR..')
            error_count += 1
            if error_count > 3:
                new_prompt = None
                break
            # Wait only when another attempt will actually follow.
            time.sleep(9)

    # The return used to sit inside the loop body, so a failed first attempt
    # raised UnboundLocalError instead of being retried.
    return new_prompt


def prompt_generator_from_Q(question_list):
    """Ask the "gpt-4-0125" model to merge a list of questions into one phrase.

    Args:
        question_list: The questions to summarize; interpolated into the
            task prompt as-is.

    Returns:
        The model's reply text, or ``None`` when more than three attempts
        failed.
    """
    system_prompt_local = (
        "You are an expert in rephrasing prompts for a text-to-video model based on the given questions."
    )

    task_prompt_local = (
        f"Given the following list of questions {question_list}, \
        create a single descriptive sentence that combines the meaning of each question into a natural, affirmative statement that provides a full, concise summary."
        "Your response should be a concise 1 phrase, without additional explanation.  (e.g., 'a small bear')"
        "Examples: "
    )

    # Few-shot examples; the answer of Example 3 was missing its closing quote.
    examples = """

        - Example 1
            Question list: ['Is there a bed?', 'Is the bed blue?', 'Are the pillows beige?', 'Are the pillows with the bed?']
            Answer: "Blue bed with beige pillows."

        - Example 2
            Question list: [Are there three real bears?]
            Answer: "Three real bears."

        - Example 3
            Question list: [Are there two people?, Are the people making pizza?]
            Answer: "Two people making pizza."

        - Example 4
            Question list: [Is there a family?, Is there one cat?, Is there a park?, Is the family taking a walk?, Is the cat walking?, Is the family enjoying?, Is the family breathing fresh air?, Is the family exercising?]
            Answer: "A family and a cat are walking in the park."

        - Example 5
            Question list: [Is there a green bench?, Is there an orange tree?, Is the bench green?, Is the tree orange?]
            Answer: "Green bench and orange tree."

    Your Current Task: Your response should be a concise 1 phrase, without additional explanation (e.g., "a small bear")

    """

    local_prompt_answer = None
    error_count = 0
    while True:
        try:
            local_response = client.chat.completions.create(
                model="gpt-4-0125",
                messages=[
                    {"role": "system", "content": system_prompt_local},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": task_prompt_local + examples},
                        ],
                    }
                ],
                max_tokens=100,
            )
            local_prompt_answer = local_response.choices[0].message.content
            break
        except Exception:
            print('ERROR..')
            error_count += 1
            if error_count > 3:
                local_prompt_answer = None
                break
            # Wait only when another attempt will actually follow.
            time.sleep(9)
    return local_prompt_answer