VectorDB-and-LLM/image_utils.py at main · engineerjkk/VectorDB-and-LLM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
from PIL import Image
from typing import Tuple, List
import base64
import requests
import os
import openai
import re
import json
from config import MY_OPENAI_API_KEY,MY_IMG_MODEL
import warnings
warnings.filterwarnings("ignore")

os.environ['OPENAI_API_KEY']= MY_OPENAI_API_KEY
openai.api_key = os.environ["OPENAI_API_KEY"]


def fetch_clip(model_name='openai/clip-vit-base-patch32') -> Tuple[CLIPModel, CLIPProcessor]:
    """
    CLIP model과 processor

    Returns:
        Tuple[CLIPModel, CLIPProcessor]: A tuple containing the CLIP model and processor.
    """
    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, processor, tokenizer

def extract_img_features(PIL_image: Image.Image, processor: CLIPProcessor, model: CLIPModel) -> torch.Tensor:
    """
    CLIP 모델을 활용하여 image embedding vector 추출

    Args:
        PIL_image (Image.Image): The image to process.
        processor (CLIPProcessor): The CLIP processor.
        model (CLIPModel): The CLIP model.

    Returns:
        torch.Tensor: The extracted image features.
    """
    inputs = processor(images=PIL_image, return_tensors="pt") # pytorch format
    outputs = model.get_image_features(**inputs)
    return outputs.detach()

def search_image(query_feature: torch.Tensor, features: List[torch.Tensor], topk: int = 10) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    주어진 vector들과 비교하여, query_feature와 유사한 vector의 index와 유사도를 제공함

    Args:
        query_feature (torch.Tensor): query image의 embedding vector
        features (List[torch.Tensor]): 이미지들의 embedding vector
        topk (int, optional): top-k

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: 유사한 이미지들의 index & cosine-distance
    """
    similarities = cosine_similarity(query_feature, torch.vstack(features)).flatten()
    # sort in descending order
    sorted_indices_desc = similarities.argsort()[::-1]
    topk_indices = sorted_indices_desc[0:topk]
    # cosine similarities for the top-k indices
    topk_similarities = similarities[topk_indices]

    return topk_indices, topk_similarities

def draw_images(images: List[Image.Image], texts: List[str]=None):
    """
    이미지 show

    Args:
        images (List[Image.Image]): 이미지 list
        texts (List[str], optional): 이미지 아래에 display 할 텍스트. Defaults to None.
    """
    k = len(images)
    if texts is None:
        texts = ['']*k
    # Set up the figure and axes for a 1x5 grid
    fig, axs = plt.subplots(1, k, figsize=(20, 4))

    if k == 1:
        axs = [axs]

    for i in range(k):
        axs[i].imshow(images[i])
        axs[i].axis('off')
        axs[i].text(0.5, -0.1, texts[i], va='bottom', ha='center', fontsize=10, transform=axs[i].transAxes)

    plt.show()


def tensor2np(tensor) -> np.ndarray:
    """
    Converts a PyTorch tensor to a NumPy array.

    Args:
        tensor (torch.Tensor): The PyTorch tensor to convert.

    Returns:
        np.ndarray: The converted NumPy array.
    """
    if tensor.is_cuda:
        numpy_array = tensor.cpu().numpy()
    else:
        numpy_array = tensor.numpy()

    return numpy_array

def detect_objects(img_path: str, model) -> Tuple[object, dict]:
    """
    이미지를 읽어오고, object detection 결과를 제공함

    Args:
        img_path (str): The path to the image file.
        model: The object detection model to use.

    Returns:
        Tuple[object, dict]: The raw results from the model and a dictionary containing detected boxes, scores,categories, and labels.
    """
    img = Image.open(img_path)
    results = model(img, size=1280, augment=True)

    pred_dict = dict()
    predictions =results.pred[0]

    pred_dict['boxes'] = tensor2np(predictions[:, :4]) # x1, y1, x2, y2
    pred_dict['scores'] = tensor2np(predictions[:, 4])
    pred_dict['categories'] = tensor2np(predictions[:, 5])

    categories = results.names
    pred_dict['labels'] = [categories[i] for i in pred_dict['categories']]

    return results, pred_dict

def crop_bbox(pil_image: Image.Image, bbox: Tuple[float, float, float, float]) -> Image.Image:
    """
    Crops a bounding box from an image.

    Args:
        pil_image (Image.Image): The PIL Image to crop.
        bbox (Tuple[float, float, float, float]): The bounding box coordinates (x_min, y_min, x_max, y_max).

    Returns:
        Image.Image: The cropped image.
    """
    x_min, y_min, x_max, y_max = bbox
    crop_box = (x_min, y_min, x_max, y_max)

    cropped_image = pil_image.crop(crop_box)

    return cropped_image

def normalize_image(pil_image: Image.Image, target_size: Tuple[int, int] = (224, 224)) -> Image.Image:
    """
    Normalizes an image by resizing it to a target size and scaling pixel values to [0, 1].

    Args:
        pil_image (Image.Image): The PIL Image to normalize.
        target_size (Tuple[int, int], optional): The target size (width, height). Defaults to (224, 224).

    Returns:
        Image.Image: The normalized image.
    """
    # resizing
    resized_image = pil_image.resize(target_size, Image.LANCZOS)

    # normalization
    np_image = np.array(resized_image).astype('float32')
    np_image /= 255.0  # pixel values to [0, 1]
    normalized_image = Image.fromarray((np_image * 255).astype('uint8'))
    return normalized_image


def filter_furniture(detections: Tuple[object, dict]) -> dict:
    """
    Detect 된 object 중 가구들만 선택하고 score>0.5 이상인 detection들만 선별함

    Args:
        detections (Tuple[object, dict]): YoloV5 output

    Returns:
        dict: A dictionary containing the filtered detections of furniture items.
    """

    furniture_class = [56, 57, 59, 60] # detections[0].names
    furniture_names = ['chair', 'couch', 'bed', 'dining table']
    furniture_detected = {}

    filter = [True if (i in furniture_names) and (s>0.5) else False for i, s in zip(detections[1]['labels'], detections[1]['scores'])]
    furniture_detected['boxes'] = detections[1]['boxes'][filter]
    furniture_detected['scores'] = detections[1]['scores'][filter]
    furniture_detected['categories'] = detections[1]['categories'][filter]
    furniture_detected['lables'] = [item for item, bool in zip(detections[1]['labels'], filter) if bool==True]

    return furniture_detected

def encode_image(image_path: str) -> str:
    """
    Encodes an image file to a base64 string.

    Args:
        image_path (str): The file path of the image to encode.

    Returns:
        str: The base64 encoded string of the image.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def describe_image(image_paths: List[str]) -> str:
    """
    GPT-4V를 활용하여 거실의 이미지를 4가지 카테고리를 기준으로 설명함.

    Args:
        image_paths (List[str]): List of paths to the images to be analyzed.

    Returns:
        str: A JSON-like string containing descriptions for each image focusing on 'Color Scheme', 'Lighting', 'Spatial Layout', and 'Architectural Features'.
    """

    text_prompt = """Please analyze the living room image provided.
Include 'Color Scheme', 'Lighting', 'Spatial Layout', and 'Architectural Features' with descriptions based on the room's characteristics.
The output should be formatted in a JSON-like dictionary structure. Each image should be done separately.

Example output :

```json
{
"Color Scheme": <Description about color scheme>,
"Lighting": <Description about lighting>,
"Spatial Layout": <Description about spatial layouts >,
"Architectural Features": <Descrption about architectural features>
}
```
    """

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
        }
    imgs = [encode_image(i) for i in image_paths]

    payload = {
            "model": MY_IMG_MODEL,
            "messages": [{"role": "user",
                        "content": []
                        },
                        ],
            "max_tokens": 1000
            }

    img_contents = [{"type": "text", "text": text_prompt}]
    for img in imgs:
        input_template = {
            "type": "image_url",
            "image_url": {
            "url": f"data:image/jpeg;base64,{img}"
            }
        }
        img_contents.append(input_template)

    payload['messages'][0]['content'] = img_contents

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    output = response.json()['choices'][0]['message']['content']
    return output

def parse_response(text: str) -> List[dict]:
    """
    GPT-4V로부터 얻은 image description을 json 형태로 변환

    Args:
        text (str): The JSON-like string to parse.

    Returns:
        List[dict]: A list of dictionaries parsed from the input string.

    The function uses regular expressions to find JSON objects in the response and parses them into Python
    dictionaries.
    """

    matches = re.findall(r'(\{[\s\S]*?\})', text)
    matches = [json.loads(m) for m in matches]
    return matches
def reformat_img_description(input_content: str, model: str = MY_IMG_MODEL) -> dict:
    """
    parse_response를 활용하여 json의 형태로 변환이 안되는 경우, MY_IMG_MODEL를 활용하여 정형화된 포멧으로 변환

    Args:
        input_content (str): The unstructured text containing image descriptions.
        model (str, optional): The model to use for reformatting. Defaults to 'gpt-4-o'.

    Returns:
        dict: A dictionary containing the reformatted image descriptions.
    """
    output_formatting_prompt = """Using the provided text, find the smallest format of json there is and store them in a list as separate elements.
The ouput list should have two json objects found from the provided text.

Desired output :
{'list': [{'Image 1': {'Color Scheme': <Color Scheme>,
    'Lighting': <Lighting>,
    'Spatial Layout': <Spatial Layout>,
    'Architectural Features': <Architectural Features>}},
    {'Image 2': {'Color Scheme': <Color Scheme>,
    'Lighting': <Lighting>,
    'Spatial Layout': <Spatial Layout>,
    'Architectural Features': <Architectural Features>}}]}

Provided text : """

    client = openai.OpenAI()

    response = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": 'You are a smart and intelligent program that understands information and provides output in JSON format'},
            {"role": "user", "content":output_formatting_prompt + input_content}
        ]
        )
    return response