visual_doc_demo/utils.py at main · EPICLab/visual_doc_demo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import requests
from collections import defaultdict
import regex as re
from datetime import datetime
import json
import os
import tiktoken

def download_file(owner, repo, file_path):
    # GitHub repository information. Example:
    # owner = 'flutter'
    # repo = 'flutter'
    # file_path = 'CONTRIBUTING.md'

    # Construct the URL to download the raw file
    raw_url = f'https://raw.githubusercontent.com/{owner}/{repo}/main/{file_path}'

    try:
        # Send a GET request to download the file
        response = requests.get(raw_url)
        if response.status_code == 200:
            print("Response code 200")
            response.encoding = 'UTF-8'
            # If the request is successful, return the file as a response
            return(response.content)
        else:
            return f"Failed to download file: {response.status_code}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

def segregate_segments_by_classes(segments_and_classes_in_all_files):
    print("Started segregating segments ...")
    #Find all distinct classes in the files
    all_classes =[]
    for segments_and_classes_in_each_file in segments_and_classes_in_all_files:
        classes_in_that_file = segments_and_classes_in_each_file[1]
        print(f"Classes in that file = ", classes_in_that_file)
        all_classes = all_classes + classes_in_that_file
    all_classes = set(all_classes)
    print(f"All distinct classes found =  {all_classes}")
    #For each segment class, initialize a list to hold the segments.
    class_segments_holder = {}
    for each_class in all_classes:
        class_segments_holder[each_class] = []

    #For each segment, add it to correct class in the dictionary
    for segments_and_classes_in_each_file in segments_and_classes_in_all_files:
        segments = segments_and_classes_in_each_file[0]
        segment_classes = segments_and_classes_in_each_file[1]
        for segment, segment_class in zip(segments, segment_classes):
            class_segments_holder[segment_class].append(segment)
    print("Completed segregating segments ...")
    return class_segments_holder

def modfify_json_for_ui(old_json, repo_name):
    new_json = {"content": {}, "flow": []}

    # Dictionary to keep track of occurrences of each step
    step_counter = defaultdict(int)
    step_rename_map = {}  # Maps old step names to new step names
    step_rename_map['Parent Node'] = f'Contributing to {repo_name}'

    for sequence_name, sequence_data in old_json.items():
        # Process content
        updated_content = {}
        for step, description in sequence_data["content"].items():
            step_counter[step] += 1
            new_step_name = f"{step} #{step_counter[step]}" if step_counter[step] > 1 else step
            step_rename_map[step] = new_step_name  # Update the map
            updated_content[new_step_name] = description

        # Add updated content to new_json
        new_json["content"].update(updated_content)

        # Process flow edges
        updated_edges = []
        for edge in sequence_data["flow"]["edges"]:
            source = step_rename_map[edge["source"]]
            target = step_rename_map[edge["target"]]
            updated_edges.append({"source": source, "target": target})

        # Add flow with updated edges and sequence name
        new_json["flow"].append({"edges": updated_edges, "sequence": sequence_name})
    return new_json


def modify_json_for_ui_without_classifier(old_json, repo_name):
    new_json = {"content": {}, "flow": []}

    # Dictionary to keep track of occurrences of each step
    step_counter = defaultdict(int)
    step_rename_map = {}  # Maps old step names to new step names
    step_rename_map['Parent Node'] = f'Contributing to {repo_name}'

    updated_content = {}
    for  topic, content in old_json.items():
        step_counter[topic] += 1
        new_step_name = f"{topic} #{step_counter[topic]}" if step_counter[topic] > 1 else topic
        step_rename_map[topic] = new_step_name  # Update the map
        updated_content[new_step_name] = content

    # Add updated content to new_json
    new_json["content"] = updated_content
    # print(updated_content)
    # Process flow edges
    for topic, content in updated_content.items():
        source = step_rename_map['Parent Node']
        target = topic
        # Add flow with updated edges and sequence name
        new_json["flow"].append({"edges": [{"source": source, "target": target}], "sequence": target})
    # print(new_json)
    return new_json

def add_links_to_json_from_content(data):
    #Extract links from each content topic
    links_dict = {}
    for topic, markdown_content in data["content"].items():
        links_dict[topic] = extract_links_from_markdown(markdown_content)

    #Add the 'links' key to the JSON structure
    data["links"] = links_dict

    return data

def extract_links_from_markdown(markdown_text):
    """
    Extract all links from a Markdown string.
    Returns a list of link URLs (including relative paths, anchors, etc.).
    """
    # This regex captures the URL within parentheses following a standard Markdown link [text](URL).
    pattern = re.compile(r'\[[^\]]*\]\(([^)]+)\)')
    return pattern.findall(markdown_text)


def save_llm_output(json_output, file_prefix):
        now = datetime.now()
        current_directory = os.getcwd()
        # Format the date and time as a string
        formatted = now.strftime("%Y-%m-%d %H:%M:%S")
        filename =  current_directory + '/static/llm_ouput/' + file_prefix + '_' + formatted +'.json'
        with open(filename, "w") as file:
            json.dump(json_output, file, indent=4)

def exceeds_token_size(text, max_tokens):
    '''
    checks if the given text exceeds the given max_tokens number.
    '''
    encoding = tiktoken.encoding_for_model("gpt-4o")
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        return True
    else:
        return False


if __name__ == '__main__':
    print(download_file('flutter', 'flutter', 'CONTRIBUTING.md'))