forked from TevinWang/code-improvement-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithub_scraper.py
More file actions
49 lines (39 loc) · 1.69 KB
/
github_scraper.py
File metadata and controls
49 lines (39 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import fnmatch
import os
import base64
import requests
from dotenv import load_dotenv
# Load environment variables from a .env file (if one is found) so that a
# locally stored token is visible to os.getenv below.
load_dotenv()
# GitHub API token taken from the environment; None when GITHUB_TOKEN is unset.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
def fetch_python_files_from_github_url(url, token=GITHUB_TOKEN, branch="main", timeout=30):
    """Download the contents of every ``*.py`` file in a GitHub repository.

    The repository's git tree is listed recursively through the GitHub REST
    API, then each matching blob is fetched and its base64 payload decoded
    as UTF-8 text.

    Args:
        url: Repository URL; the last two path segments are taken as the
            owner and the repository name.
        token: GitHub API token. When falsy, requests are sent
            unauthenticated instead of with an invalid credential.
        branch: Branch (or other tree-ish) whose tree is listed.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(path, decoded_content)`` tuples, one per Python file
        that was fetched successfully.

    Raises:
        ValueError: If ``url`` does not contain at least two path segments,
            or if the tree listing does not return HTTP 200.
    """
    owner, repo = url.strip("/").split("/")[-2:]
    tree_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
    headers = {"Accept": "application/vnd.github.v3+json"}
    # Only authenticate when a token is actually available: sending the
    # literal "token None" is rejected as bad credentials, even for public
    # repositories that would otherwise be readable anonymously.
    if token:
        headers["Authorization"] = f"token {token}"
    print(tree_url)

    # A timeout keeps a stalled connection from hanging the scraper forever.
    tree_response = requests.get(tree_url, headers=headers, timeout=timeout)
    if tree_response.status_code != 200:
        raise ValueError(f"Error fetching repo contents: {tree_response.status_code}")

    contents_list = []
    for file in tree_response.json()["tree"]:
        # Skip directories/submodules and anything that is not a Python file.
        if file["type"] != "blob" or not fnmatch.fnmatch(file["path"], "*.py"):
            continue
        print(f"scrapping file: {file['path']}")
        file_response = requests.get(file["url"], headers=headers, timeout=timeout)
        if file_response.status_code != 200:
            # Best effort: keep going, but make the gap visible to the user.
            print(f"skipping file: {file['path']} (HTTP {file_response.status_code})")
            continue
        content_base64 = file_response.json()["content"]
        content_decoded = base64.b64decode(content_base64).decode("utf-8")
        contents_list.append((file["path"], content_decoded))
    return contents_list
if __name__ == "__main__":
    # Script entry point: scrape one example repository and dump all of its
    # Python files into a single XML-like text file.
    github_url = "https://github.com/benthecoder/ClassGPT"
    python_files = fetch_python_files_from_github_url(github_url)
    # Write as UTF-8 explicitly: the file contents were decoded as UTF-8 and
    # may contain characters the platform's default encoding cannot represent.
    with open("python_files.txt", "w", encoding="utf-8") as f:
        f.write('<files>')
        for file_path, file_content in python_files:
            f.write(
                f"<file>\n<file_path>{file_path}</file_path>\n<file_content>\n{file_content}\n</file_content>\n</file>\n"
            )
        f.write('</files>')