forked from jswope00/AI-MicroApps
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp_visual_transcript.py
More file actions
271 lines (226 loc) · 12.9 KB
/
app_visual_transcript.py
File metadata and controls
271 lines (226 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import streamlit as st
import cv2
import tempfile
import os
from visual_transcription.src.api_calls import analyze_image_Azure_Vision_Analysis, analyze_image_gpt4 # Your custom function to call the API
from visual_transcription.utils.initialise_LLM_models import Azure_Vision_analyse_dict
from visual_transcription.utils.utilities import get_frame_timestamp, image_to_base64, insert_VT_into_AT
import json
# -----------------------------------------------
# Initialise some of the session_state values
# -----------------------------------------------
if "Azure Vision Add Captions" not in st.session_state:
    st.session_state["Azure Vision Add Captions"] = Azure_Vision_analyse_dict  # TODO: Make more efficient - only one if statement required
if "gpt-4o" not in st.session_state:
    # Build the config path with os.path.join so it resolves on every OS
    # (the original raw string used Windows-only backslashes).
    gpt4o_config_path = os.path.join("visual_transcription", "utils", "chat_GPT.json")
    with open(gpt4o_config_path, "r") as json_file:
        gpt4o_info = json.load(json_file)
    st.session_state['gpt-4o'] = {"prompt": gpt4o_info["prompt"], "max_words": gpt4o_info["max_words"]}
    # Bake the configured word limit into the prompt template up front.
    st.session_state['gpt-4o']["prompt"] = st.session_state['gpt-4o']["prompt"].replace("%MAX_WORDS%", gpt4o_info["max_words"])
if "saved_frames" not in st.session_state:
    # frame_index -> {'frame', 'has_visual_transcripts', ...} for frames the user saved
    st.session_state.saved_frames = dict()
if 'video' not in st.session_state:
    # cv2.VideoCapture handle for the uploaded video (None until upload)
    st.session_state.video = None
if 'frame_number' not in st.session_state:
    st.session_state.frame_number = 0
if 'total_frames' not in st.session_state:
    st.session_state.total_frames = 0
if 'uploaded' not in st.session_state:
    st.session_state.uploaded = False
if 'uploaded_transcript_objects' not in st.session_state:
    # frames promoted from saved_frames into the final transcript
    st.session_state.uploaded_transcript_objects = dict()
if "audio_transcript" not in st.session_state:
    st.session_state.audio_transcript = []
st.title('Video Transcription Service')
# Ask for an audio transcript (JSON) until one has been provided;
# truthiness check is the idiomatic way to test for an empty list.
if not st.session_state.audio_transcript:
    uploaded_json = st.file_uploader('Drag and drop an audio transcript file here', type=['json'])
    if uploaded_json is not None:
        st.session_state.audio_transcript = json.load(uploaded_json)
# -----------------------------------------------
# Model Selection
# -----------------------------------------------
model_options = ['Azure Vision Add Captions', 'gpt-4o', 'Model C']
st.session_state['selected_model'] = st.selectbox('Select a model for visual transcription', model_options)
# -----------------------------------------------
# Display information that is relevant to the selected_model
# -----------------------------------------------
if st.session_state['selected_model'] == "Azure Vision Add Captions":
    # Azure Vision lets the caller pick which analysis features to run.
    azure_feature_choices = ["TAGS", "OBJECTS", "CAPTION", "DENSE_CAPTIONS", "READ", "SMART_CROPS", "PEOPLE"]
    chosen_features = st.multiselect('Select the visual aspects that the model should transcribe', azure_feature_choices)
    st.session_state['Azure Vision Add Captions']["visual_features"] = chosen_features
if st.session_state['selected_model'] == "gpt-4o":
    # Content-safety filter configuration for the gpt-4o model.
    filter_options = ["hate", "protected_material_code", "protected_material_text", "self_harm", "sexual", "violence"]
    selected_filters = st.multiselect('Select filters', filter_options)
    st.session_state['gpt-4o']["selected_filters"] = selected_filters
    # Every filter except "violence" currently supports only the "safe"
    # severity, so it is assigned directly; a single loop replaces six
    # near-identical copy-pasted if-blocks.
    for filter_name in selected_filters:
        if filter_name == "violence":
            severity_options = ["safe"]
            chosen_severity = st.selectbox('select the filter severity for violence', severity_options)
        else:
            chosen_severity = "safe"
        st.session_state["gpt-4o"][filter_name] = chosen_severity
    # Let the user adjust the word limit. The current limit still appears
    # verbatim inside the prompt, so swap that substring for the new value
    # before storing the new limit.
    max_words = str(st.text_input(label="Please select the maximum number of words that the VT may be", value=st.session_state['gpt-4o']["max_words"]))
    st.session_state['gpt-4o']["prompt"] = st.session_state['gpt-4o']["prompt"].replace(st.session_state['gpt-4o']["max_words"], max_words)
    st.session_state['gpt-4o']["max_words"] = max_words
    # Expose the (possibly edited) prompt for manual tweaking.
    st.session_state['gpt-4o']["prompt"] = st.text_area(label="prompt", value=st.session_state['gpt-4o']["prompt"])
# -----------------------------------------------
# File Uploader: Drag and Drop a Video File
# -----------------------------------------------
uploaded_file = None
if not st.session_state.uploaded:
    uploaded_file = st.file_uploader('Drag and drop a video file here', type=['mp4', 'avi', 'mov'])
if uploaded_file is not None and not st.session_state.uploaded:
    st.success('Video uploaded successfully! Processing transcription...')
    # Persist the upload to disk so OpenCV can open it by path.
    # delete=False keeps the file alive across Streamlit reruns.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name
    st.write(f'Temporary file path: {temp_file_path}')
    if not os.path.exists(temp_file_path):
        st.error('Temporary file was not created successfully.')
    else:
        st.session_state.video = cv2.VideoCapture(temp_file_path)
        if st.session_state.video.isOpened():
            st.session_state.total_frames = int(st.session_state.video.get(cv2.CAP_PROP_FRAME_COUNT))
            st.session_state.uploaded = True
        else:
            st.error('Could not open video file.')
# -----------------------------------------------
# Display Video Frames and Controls
# -----------------------------------------------
if st.session_state.uploaded:
    # The slider drives which frame is currently shown.
    st.session_state.frame_number = st.slider('Select frame', 0, st.session_state.total_frames - 1, st.session_state.frame_number)
    st.write(f"frame_number: {st.session_state.frame_number}")
    preview = st.empty()
    # Seek to the selected frame and render it (OpenCV returns BGR).
    st.session_state.video.set(cv2.CAP_PROP_POS_FRAMES, st.session_state.frame_number)
    ok, bgr_frame = st.session_state.video.read()
    if ok:
        preview.image(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB), channels='RGB')
    else:
        st.error('Could not read the frame.')
    # Navigation and Save Frame buttons
    left_col, save_col, right_col = st.columns(3)
    with left_col:
        if st.button('Move Left') and st.session_state.frame_number > 0:
            st.session_state.frame_number -= 1
    with save_col:
        if st.button('Save Frame Index'):
            # Re-seek so we capture exactly the frame currently displayed.
            st.session_state.video.set(cv2.CAP_PROP_POS_FRAMES, st.session_state.frame_number)
            ok, bgr_frame = st.session_state.video.read()
            if ok:
                st.session_state.saved_frames[st.session_state.frame_number] = {
                    'frame': cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB),
                    'has_visual_transcripts': False,
                    'getting_visual_transcripts': False,
                    'visual_transcripts': None,
                }
                st.write(f'Saved frame index: {st.session_state.frame_number}')
            else:
                st.error('Could not capture the frame to save.')
    with right_col:
        if st.button('Move Right') and st.session_state.frame_number < st.session_state.total_frames - 1:
            st.session_state.frame_number += 1
# -----------------------------------------------
# Sidebar: Display Saved Frames as Clickable Cards (using st.button)
# -----------------------------------------------
with st.sidebar:
    st.markdown("### Selected Frames")
    # sorted() materialises the items, so deleting from the dict inside
    # the loop body is safe.
    for idx, info in sorted(st.session_state.saved_frames.items()):
        b64_image = image_to_base64(info['frame'])
        caption = info['visual_transcripts'] or 'No transcript yet'
        # Render the card via raw HTML for border/spacing styling.
        st.markdown(f"""
        <div style="border: 1px solid #ccc; padding: 10px; margin-bottom: 10px;">
            <h4>Frame {idx}</h4>
            <img src="data:image/jpeg;base64,{b64_image}" style="width:100%;" />
            <p>{caption}</p>
        </div>
        """, unsafe_allow_html=True)
        # Create a button below each card
        if not info['has_visual_transcripts']:
            if st.button(f"Transcribe frame {idx}", key=f"btn_{idx}"):
                model = st.session_state['selected_model']
                if model == "Azure Vision Add Captions":
                    response = analyze_image_Azure_Vision_Analysis(info['frame'], st.session_state[model]['client'], st.session_state['Azure Vision Add Captions']["visual_features"])
                    message = response["message"]
                elif model == "gpt-4o":
                    response = analyze_image_gpt4(info['frame'], st.session_state["gpt-4o"]["prompt"])
                    message = response["choices"][0]["message"]["content"]
                # elif model == "new model":
                #     response = MODEL_FUNCTION(INPUTS)
                #     message = response["choices"][0]["message"]["content"]
                else:
                    raise ValueError("Invalid model selected")
                print("This is the session state: ", model)
                entry = st.session_state.saved_frames[idx]
                entry['visual_transcripts'] = message
                entry['has_visual_transcripts'] = True
                entry['time_stamp'] = get_frame_timestamp(idx, st.session_state.video)
                st.success("Visual transcriptions updated for frame nr: " + str(idx) + ".")
        # Button to add a frame to the transcript objects
        elif st.button(f"Add frame {idx} to transcript", key=f"add_{idx}"):
            st.session_state.uploaded_transcript_objects[idx] = st.session_state.saved_frames[idx]
            del st.session_state.saved_frames[idx]
            st.success("Frame " + str(idx) + " added to transcript objects.")
# -----------------------------------------------
# Sidebar: Display Uploaded Transcript Messages via a Select Slider
# -----------------------------------------------
with st.sidebar:
    st.markdown("### Uploaded Transcript Messages")
    # Display the audio transcript explicitly; the original relied on
    # Streamlit's "magic" rendering of a bare expression, which is easy
    # to mistake for a no-op statement.
    st.write(st.session_state.audio_transcript)
    if st.session_state.uploaded_transcript_objects:
        for frame_index, frame_info in sorted(st.session_state.uploaded_transcript_objects.items()):
            st.write(frame_info['visual_transcripts'], frame_info['time_stamp'])
            insert_VT_into_AT(frame_info)
        # NOTE(review): frame_index leaks out of the loop above, so only
        # the LAST frame's transcript is echoed here — confirm intended.
        transcript_message = st.session_state.uploaded_transcript_objects[frame_index]['visual_transcripts']
        st.write("Transcript message:", transcript_message)
    else:
        st.write("No uploaded transcript objects yet.")
# -----------------------------------------------
# Sidebar: Refresh Button
# -----------------------------------------------
# with st.sidebar:
# if st.button('Refresh'):
# st.success('Reloaded')
# -----------------------------------------------
# Add Some Spacing at the Bottom
# -----------------------------------------------
# Invisible spacer div so the last page content is not flush against the
# bottom edge of the viewport.
st.markdown("<div style='height: 200px;'></div>", unsafe_allow_html=True)
# Optionally, you can release the video capture object on app exit
# if st.session_state.video is not None:
# st.session_state.video.release()