diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..fcd84df --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,16 @@ +name: Tests + +on: + pull_request: + # The branches below must be a subset of the branches above + branches: [main] + +jobs: + lint: + runs-on: [self-hosted, cluster.loopingz.com] + steps: + - uses: actions/checkout@v2 + # black formatting + - uses: psf/black@stable + with: + options: "--line-length 100 --check --diff" diff --git a/.gitignore b/.gitignore index b7faf40..335eb43 100644 --- a/.gitignore +++ b/.gitignore @@ -205,3 +205,6 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# Configuration +config.yml \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..c11fa8b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, + + "black-formatter.args": ["--line-length", "100"], + "pylint.args": ["--disable=W0401, W0614, W1514"] +} \ No newline at end of file diff --git a/README.md b/README.md index 8da0517..a72287d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ -# modalshift +# MODALSHIFT + Open developments for MODALSHIFT EU project + +## Configuration file + +Put a file named config.yml at the root of the repository. + +Here is an example configuration. 
def download_object_from_s3_with_progress(client, *, bucket, key, version_id=None, filename):
    """
    Download an object from S3 while showing a tqdm progress bar.

    Adapted from https://alexwlchan.net/2021/04/s3-progress-bars/

    Args:
        client: S3 client exposing ``head_object`` and ``download_file``.
        bucket: Name of the bucket that holds the object.
        key: Key of the object inside the bucket.
        version_id: Optional object version. When given, it is sent with both
            the HeadObject request and the download so they target the same
            version.
        filename: Local path the object is written to; also used as the
            label of the progress bar.
    """
    # Build the version arguments once so the size lookup and the download
    # cannot drift apart.
    head_kwargs = {"Bucket": bucket, "Key": key}
    extra_args = None
    if version_id is not None:
        extra_args = {"VersionId": version_id}
        head_kwargs.update(extra_args)

    # First get the size, so tqdm knows what it is counting up to.
    # The size could theoretically change between this HeadObject and the
    # start of the download, but that would only affect the progress bar.
    object_size = client.head_object(**head_kwargs)["ContentLength"]

    # ``unit="B"`` with ``unit_scale=True`` makes tqdm print SI-prefixed
    # figures such as "14.5kB/s" (the "B" is just a suffix string for tqdm).
    # The download callback receives the number of bytes transferred since
    # the previous call, which is exactly what ``pbar.update`` expects, so the
    # bound method is passed directly.
    with tqdm.tqdm(total=object_size, unit="B", unit_scale=True, desc=filename) as pbar:
        download_object_from_s3(client, bucket, key, filename, extra_args, pbar.update)


def download_object_from_s3(client, bucket, key, filename, extraArgs=None, callback=None):
    """
    Download an object from S3 to a local file.

    Args:
        client: S3 client exposing ``download_file``.
        bucket: Name of the bucket that holds the object.
        key: Key of the object inside the bucket.
        filename: Local path the object is written to. Missing parent
            directories are created.
        extraArgs: Optional extra arguments forwarded as ``ExtraArgs``
            (for instance ``{"VersionId": ...}``).
        callback: Optional callable forwarded as ``Callback``.
    """
    # Make sure the destination directory exists before handing the path to
    # the client; a bare file name has no parent directory to create.
    parent_directory = os.path.dirname(filename)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    client.download_file(
        Bucket=bucket, Key=key, ExtraArgs=extraArgs, Filename=filename, Callback=callback
    )
def list_folder_objects(client, bucket, prefix):
    """
    Return the keys of all objects stored under ``prefix`` in ``bucket``.

    Args:
        client: S3 client exposing ``get_paginator``.
        bucket: Name of the bucket to list.
        prefix: Key prefix to list (a "folder", usually ending with "/").

    Returns:
        A list of object keys; empty when nothing matches the prefix.
    """
    # A single list_objects call returns a limited number of keys, so walk
    # every page to get the complete listing.
    paginator = client.get_paginator("list_objects")

    file_list = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # "Contents" is absent from the response when no object matches.
        file_list.extend(entry["Key"] for entry in page.get("Contents", []))

    return file_list