drcode/github_scraper.py at main · unkn-wn/drcode · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import fnmatch
import os
import base64
import requests
from dotenv import load_dotenv

# Let python-dotenv populate the process environment (typically from a local
# .env file) before the token is read below; the order of these two
# statements matters.
load_dotenv()


# Default GitHub API token for this module; os.getenv returns None when the
# GITHUB_TOKEN environment variable is not set.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


def fetch_python_files_from_github_url(url, token=GITHUB_TOKEN, branch="main"):
    """Download every ``*.py`` file of a GitHub repository.

    The repository tree is listed recursively through the GitHub Git Trees
    API, then each matching blob is fetched and decoded from base64 to text.

    Args:
        url: Repository URL whose last two path segments are the owner and
            the repository name, e.g. ``https://github.com/owner/repo``.
            A trailing ``/`` or ``.git`` suffix is tolerated.
        token: GitHub API token. When empty or ``None`` the requests are
            sent unauthenticated instead of with an invalid credential.
        branch: Branch name (or tree SHA) to list. Defaults to ``"main"``.

    Returns:
        A list of ``(path, content)`` tuples, one per Python file that could
        be fetched, in the order the tree listing returns them.

    Raises:
        ValueError: If the owner/repo cannot be extracted from ``url`` or
            the tree request does not return HTTP 200.
    """
    # requests has no default timeout; without one a stalled connection
    # would block forever.
    timeout_seconds = 30

    repo_path = url.strip("/")
    if repo_path.endswith(".git"):
        # Clone URLs end in ".git", which is not part of the repository name.
        repo_path = repo_path[: -len(".git")]
    segments = repo_path.split("/")
    if len(segments) < 2:
        raise ValueError(f"Cannot extract owner and repo from URL: {url!r}")
    owner, repo = segments[-2:]

    tree_url = (
        f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
    )
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        # Only authenticate when a token exists: "token None" is rejected as
        # bad credentials, while anonymous access works for public repos.
        headers["Authorization"] = f"token {token}"

    print(tree_url)
    tree_response = requests.get(tree_url, headers=headers, timeout=timeout_seconds)
    if tree_response.status_code != 200:
        raise ValueError(f"Error fetching repo contents: {tree_response.status_code}")

    tree_payload = tree_response.json()
    if tree_payload.get("truncated"):
        # The API caps recursive listings; make the incomplete result visible.
        print("warning: tree listing was truncated, some files may be missing")

    contents_list = []
    for file in tree_payload["tree"]:
        if file["type"] != "blob" or not fnmatch.fnmatch(file["path"], "*.py"):
            continue

        print(f"scrapping file: {file['path']}")
        file_response = requests.get(
            file["url"], headers=headers, timeout=timeout_seconds
        )
        if file_response.status_code != 200:
            # Best effort: keep going, but do not drop the file silently.
            print(f"skipping {file['path']}: HTTP {file_response.status_code}")
            continue

        # Blob contents are delivered base64-encoded.
        content_base64 = file_response.json()["content"]
        content_decoded = base64.b64decode(content_base64).decode("utf-8")
        contents_list.append((file["path"], content_decoded))
    return contents_list


if __name__ == "__main__":
    # Example run: scrape one sample repository and dump all of its Python
    # files into a single XML-like text file in the working directory.
    github_url = "https://github.com/benthecoder/ClassGPT"
    python_files = fetch_python_files_from_github_url(github_url)

    # Write as UTF-8 explicitly: the sources were decoded as UTF-8 and may
    # contain characters the platform's default encoding cannot represent.
    with open("python_files.txt", "w", encoding="utf-8") as f:
        f.write('<files>')
        for file_path, file_content in python_files:
            f.write(
                f"<file>\n<file_path>{file_path}</file_path>\n<file_content>\n{file_content}\n</file_content>\n</file>\n"
            )
        f.write('</files>')