diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..546c37b80 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +* text=auto +*.py text eol=lf +*.sh text eol=lf diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e2ca0093d..837ac9eb2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,32 +1,58 @@ -/augur/cli/ @sgoggins -/augur/metrics/ @sgoggins @sgoggins -/augur/housekeeper/ @sgoggins -/augur/server.py @sgoggins @sgoggins -/augur/application.py @sgoggins @sgoggins @sgoggins -/augur/routes/ @sgoggins @sgoggins @sgoggins - -/frontend/ @sgoggins @sgoggins - -/schema/ @sgoggins @sgoggins - -/schema/generate @sgoggins - -/workers/ @sgoggins @sgoggins @sgoggins - -/README.md @sgoggins @sgoggins -/CONTRIBUTING.md @sgoggins @sgoggins -/LICENSE @sgoggins @sgoggins -/CODE_OF_CONDUCT.md @sgoggins @sgoggins - -/tests/ @sgoggins -/util/docker @sgoggins -/scripts/ @sgoggins -/docs/ @sgoggins -/.dockerignore @sgoggins -/.travis.yml @sgoggins -*compose.yml @sgoggins -/readthedocs.yml @sgoggins -/Makefile @sgoggins -/tox.ini @sgoggins -*requirements.txt @sgoggins -/setup.py @sgoggins @sgoggins +# API +/collectoss/api/ @MoralCode + +/collectoss/api/view @MoralCode @Ulincsys +/collectoss/api/routes/user.py @MoralCode @Ulincsys + +# CLI +/collectoss/cli/ @MoralCode + +# Frontend +/collectoss/templates/ @MoralCode + +# Schema +/collectoss/application/db/ @MoralCode +/collectoss/application/schema/ @MoralCode + +# Workers +/collectoss/tasks @MoralCode + +# Repo and Community Infrastructure +/README.md @MoralCode +/CONTRIBUTING.md @MoralCode +/CREDITS.md @MoralCode +/MAINTAINERS.md @MoralCode +SECURITY.md @MoralCode +.github/ @MoralCode +pyproject.toml @MoralCode + +# Testing +/tests/ @MoralCode +/scripts/ci @MoralCode + +# Keyman +/keyman @MoralCode @Ulincsys +/tests/test_key_manager @MoralCode @Ulincsys + +# Docs +/docs/ @MoralCode +/readthedocs.yml @MoralCode + +# Install - Manual +/Makefile @MoralCode +/scripts/control @MoralCode +/scripts/install @MoralCode +/scripts/mat_view_explore @MoralCode + +# Install - Containers +/docker @MoralCode +/scripts/docker @MoralCode + +/.dockerignore @MoralCode +*compose.yml @MoralCode + +# Misc +/collectoss/metrics/ @MoralCode +/collectoss/server.py @MoralCode +/collectoss/application.py @MoralCode +/collectoss/application/config.py @MoralCode diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 324c8d17e..155a8ca62 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -20,9 +20,9 @@ A clear and concise description of what you expected to happen. If applicable, add screenshots to help explain your problem. If your bug is related to the UI, you **must** include screenshots. **Log files** -Attach the relevant log files here. Server and installation logs can be found in the `logs/` directory in the root `augur/` directory, and the logs for each worker are stored in their respective directories. If the logs are pretty long (> 50ish lines or just use your best judgement) please use a Gist or a [pastebin](https://pastebin.com/). **These logs file are required if you would like help solving your issue.** +Attach the relevant log files here. Logs are most commonly found using docker's native logging. If the logs are pretty long (> 50ish lines or just use your best judgement) please use a Gist or a [pastebin](https://pastebin.com/). 
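For example, if you are running the stack with Docker Compose, recent logs can usually be pulled with Docker's native logging commands (a sketch; `core` is the backend service name used in this repository's compose file, so adjust the name to your deployment):

```bash
# Show the last 200 log lines for the backend service
docker compose logs --tail=200 core

# Capture logs from every service in the stack into a single file
docker compose logs > collectoss-logs.txt
```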
**Including logs helps us help you substantially faster.** **Software versions:** - - Augur: (you can use `pip show augur` to find your version) + - CollectOSS: (which release version or git hash are you running CollectOSS from?) - OS: (`sw_vers` for macOS, `lsb_release -a` on Linux) - Browser: (if applicable) \ No newline at end of file diff --git a/.github/SECURITY_ADVISORY_TEMPLATE.md b/.github/SECURITY_ADVISORY_TEMPLATE.md new file mode 100644 index 000000000..2d3eec8a2 --- /dev/null +++ b/.github/SECURITY_ADVISORY_TEMPLATE.md @@ -0,0 +1,20 @@ +### Description +Provide a clear and concise description of the vulnerability. + +### Impact +What kind of damage could this cause? (e.g., unauthorized data access, service disruption). + +### Reproduction Steps +Please provide a proof-of-concept or clear, step-by-step instructions to reproduce the issue. + +### Affected Versions +Which versions of CollectOSS are confirmed to be affected? + +### AI Disclosure +**Did you use Artificial Intelligence (AI) tools to identify or document this vulnerability?** +- [ ] Yes +- [ ] No +*If yes, please specify which tool was used and which parts of the report were generated or assisted by AI.* + +### Suggested Fix +If you have a proposed fix or patch, please describe it here. \ No newline at end of file diff --git a/.github/profile/README.md b/.github/profile/README.md index fa406baec..e69de29bb 100644 --- a/.github/profile/README.md +++ b/.github/profile/README.md @@ -1,8 +0,0 @@ -# Welcome! -![Augur Landing](chaosscon.jpg) - -Augur is part of the CHAOSS Community, and focuses on the accurate, verifiable collection of open source software health and sustainability data. - -**Through May 18, 2022, Augur will not be issuing new releases. This freeze is due to a highly distributed documentation update effort, and we don't want to have to resolve any more merge conflicts than are necessary** - -Great things are coming! Stay Tuned! diff --git a/.github/profile/augur-landing.jpg b/.github/profile/augur-landing.jpg deleted file mode 100644 index ac9fbf930..000000000 Binary files a/.github/profile/augur-landing.jpg and /dev/null differ diff --git a/.github/profile/chaosscon.jpg b/.github/profile/chaosscon.jpg deleted file mode 100644 index 11538d803..000000000 Binary files a/.github/profile/chaosscon.jpg and /dev/null differ diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 2969abeae..437997d1d 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -16,5 +16,5 @@ Contributing Conventions: 2. Build and test your changes before submitting a PR. 3. Sign your commits -By following the community's [contribution conventions](https://github.com/chaoss/augur/blob/main/CONTRIBUTING.md) upfront, the review process will be accelerated and your PR merged more quickly. +By following the community's [contribution conventions](https://github.com/chaoss/collectoss/blob/main/CONTRIBUTING.md) upfront, the review process will be accelerated and your PR merged more quickly. 
--> \ No newline at end of file diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 27bcee3fb..a4dce4c2b 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -1,4 +1,4 @@ template: | - ## What’s Changed + ## What's Changed $CHANGES diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml new file mode 100644 index 000000000..138b175f5 --- /dev/null +++ b/.github/workflows/bandit.yml @@ -0,0 +1,51 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# Bandit is a security linter designed to find common security issues in Python code. +# This action will run Bandit on your codebase. +# The results of the scan will be found under the Security tab of your repository. + +# https://github.com/marketplace/actions/bandit-scan is ISC licensed, by abirismyname +# https://pypi.org/project/bandit/ is Apache v2.0 licensed, by PyCQA + +name: Bandit +on: + push: + branches: [ "main", "release" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "main" ] + schedule: + - cron: '24 2 * * 2' + +jobs: + bandit: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Bandit Scan + uses: shundor/python-bandit-scan@ab1d87dfccc5a0ffab88be3aaac6ffe35c10d6cd + with: # optional arguments + # exit with 0, even with results found + exit_zero: true # optional, default is DEFAULT + # Github token of the repository (automatically created by Github) + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information. + # File or directory to run bandit on + # path: # optional, default is . + # Report only issues of a given severity level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) + # level: # optional, default is UNDEFINED + # Report only issues of a given confidence level or higher. Can be LOW, MEDIUM or HIGH. 
Default is UNDEFINED (everything) + # confidence: # optional, default is UNDEFINED + # comma-separated list of paths (glob patterns supported) to exclude from scan (note that these are in addition to the excluded paths provided in the config file) (default: .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg) + excluded_paths: tests + # comma-separated list of test IDs to skip + # skips: # optional, default is DEFAULT + # path to a .bandit file that supplies command line arguments + # ini_path: # optional, default is DEFAULT + diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 3a0e3f953..ec40bc11d 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -17,7 +17,7 @@ jobs: name: Test on macOS runs-on: macos-latest env: - UV_LOCKED: true # Assert that uv.lock is up-to-date + UV_LOCKED: true # Assert that uv.lock is up-to-date steps: - name: Checkout repository uses: actions/checkout@v4 @@ -41,7 +41,7 @@ jobs: run: uv sync --all-groups - name: Install workers - run: uv run scripts/install/workers.sh dev + run: uv run scripts/ci/workers.sh dev - name: Install nltk run: | @@ -50,7 +50,6 @@ jobs: uv run python -m nltk.downloader popular uv run python -m nltk.downloader universal_tagset - test-e2e: name: End-to-end test (Docker) runs-on: ubuntu-latest @@ -59,7 +58,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet sudo rm -rf "$AGENT_TOOLSDIRECTORY" - + - name: Checkout repository uses: actions/checkout@v4 @@ -79,9 +78,11 @@ jobs: with: context: . file: ./docker/database/Dockerfile - build-args: VERSION=${{ steps.version.outputs.version }} + build-args: | + VERSION=${{ steps.version.outputs.version }} + REVISION=${{ github.sha }} platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_database:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss-database:test cache-from: type=gha,scope=container-database cache-to: type=gha,scope=container-database,mode=min load: true @@ -91,9 +92,11 @@ jobs: with: context: . file: ./docker/keyman/Dockerfile - build-args: VERSION=${{ steps.version.outputs.version }} + build-args: | + VERSION=${{ steps.version.outputs.version }} + REVISION=${{ github.sha }} platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_keyman:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss-keyman:test cache-from: type=gha,scope=container-keyman cache-to: type=gha,scope=container-keyman,mode=min load: true @@ -103,9 +106,11 @@ jobs: with: context: . file: ./docker/rabbitmq/Dockerfile - build-args: VERSION=${{ steps.version.outputs.version }} + build-args: | + VERSION=${{ steps.version.outputs.version }} + REVISION=${{ github.sha }} platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_rabbitmq:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss-rabbitmq:test cache-from: type=gha,scope=container-rabbitmq cache-to: type=gha,scope=container-rabbitmq,mode=min load: true @@ -115,28 +120,30 @@ jobs: with: context: . 
file: ./docker/backend/Dockerfile - build-args: VERSION=${{ steps.version.outputs.version }} + build-args: | + VERSION=${{ steps.version.outputs.version }} + REVISION=${{ github.sha }} platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_backend:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss:test cache-from: type=gha,scope=container-backend cache-to: type=gha,scope=container-backend,mode=min load: true - name: Prepare compose file run: | - yq eval -i '.services.augur.image = "ghcr.io/${{ github.repository_owner }}/augur_backend:test"' docker-compose.yml - yq eval -i '.services.augur.pull_policy = "never"' docker-compose.yml - yq eval -i '.services.augur.restart = "no"' docker-compose.yml + yq eval -i '.services.core.image = "ghcr.io/${{ github.repository_owner }}/collectoss:test"' docker-compose.yml + yq eval -i '.services.core.pull_policy = "never"' docker-compose.yml + yq eval -i '.services.core.restart = "no"' docker-compose.yml - yq eval -i '.services.augur-db.image = "ghcr.io/${{ github.repository_owner }}/augur_database:test"' docker-compose.yml - yq eval -i '.services.augur-db.pull_policy = "never"' docker-compose.yml - yq eval -i '.services.augur-db.restart = "no"' docker-compose.yml + yq eval -i '.services.database.image = "ghcr.io/${{ github.repository_owner }}/collectoss-database:test"' docker-compose.yml + yq eval -i '.services.database.pull_policy = "never"' docker-compose.yml + yq eval -i '.services.database.restart = "no"' docker-compose.yml - yq eval -i '.services.augur-keyman.image = "ghcr.io/${{ github.repository_owner }}/augur_keyman:test"' docker-compose.yml - yq eval -i '.services.augur-keyman.pull_policy = "never"' docker-compose.yml - yq eval -i '.services.augur-keyman.restart = "no"' docker-compose.yml + yq eval -i '.services.keyman.image = "ghcr.io/${{ github.repository_owner }}/collectoss-keyman:test"' docker-compose.yml + yq eval -i '.services.keyman.pull_policy = "never"' docker-compose.yml + yq eval -i '.services.keyman.restart = "no"' docker-compose.yml - yq eval -i '.services.rabbitmq.image = "ghcr.io/${{ github.repository_owner }}/augur_rabbitmq:test"' docker-compose.yml + yq eval -i '.services.rabbitmq.image = "ghcr.io/${{ github.repository_owner }}/collectoss-rabbitmq:test"' docker-compose.yml yq eval -i '.services.rabbitmq.pull_policy = "never"' docker-compose.yml yq eval -i '.services.rabbitmq.restart = "no"' docker-compose.yml @@ -182,7 +189,8 @@ jobs: # Always run this step to get logs, even if the previous step fails if: always() # We use tail so that we can see the name of each file as it's printed - run: "docker run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs tail -n +0'" + run: "docker run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs + tail -n +0'" test-e2e-podman: name: End-to-end test (Podman) @@ -206,7 +214,7 @@ jobs: containerfiles: | ./docker/database/Dockerfile platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_database:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss-database:test layers: true - name: Build keyman container @@ -216,7 +224,7 @@ jobs: containerfiles: | ./docker/keyman/Dockerfile platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_keyman:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss-keyman:test layers: true - name: Build rabbitmq container @@ -226,7 +234,7 @@ jobs: containerfiles: | ./docker/rabbitmq/Dockerfile platforms: linux/amd64 - tags: ghcr.io/${{ 
github.repository_owner }}/augur_rabbitmq:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss-rabbitmq:test layers: true - name: Build backend container @@ -236,24 +244,24 @@ jobs: containerfiles: | ./docker/backend/Dockerfile platforms: linux/amd64 - tags: ghcr.io/${{ github.repository_owner }}/augur_backend:test + tags: ghcr.io/${{ github.repository_owner }}/collectoss:test layers: true - name: Prepare compose file run: | - yq eval -i '.services.augur.image = "ghcr.io/${{ github.repository_owner }}/augur_backend:test"' docker-compose.yml - yq eval -i '.services.augur.pull_policy = "never"' docker-compose.yml - yq eval -i '.services.augur.restart = "no"' docker-compose.yml + yq eval -i '.services.core.image = "ghcr.io/${{ github.repository_owner }}/collectoss:test"' docker-compose.yml + yq eval -i '.services.core.pull_policy = "never"' docker-compose.yml + yq eval -i '.services.core.restart = "no"' docker-compose.yml - yq eval -i '.services.augur-db.image = "ghcr.io/${{ github.repository_owner }}/augur_database:test"' docker-compose.yml - yq eval -i '.services.augur-db.pull_policy = "never"' docker-compose.yml - yq eval -i '.services.augur-db.restart = "no"' docker-compose.yml + yq eval -i '.services.database.image = "ghcr.io/${{ github.repository_owner }}/collectoss-database:test"' docker-compose.yml + yq eval -i '.services.database.pull_policy = "never"' docker-compose.yml + yq eval -i '.services.database.restart = "no"' docker-compose.yml - yq eval -i '.services.augur-keyman.image = "ghcr.io/${{ github.repository_owner }}/augur_keyman:test"' docker-compose.yml - yq eval -i '.services.augur-keyman.pull_policy = "never"' docker-compose.yml - yq eval -i '.services.augur-keyman.restart = "no"' docker-compose.yml + yq eval -i '.services.keyman.image = "ghcr.io/${{ github.repository_owner }}/collectoss-keyman:test"' docker-compose.yml + yq eval -i '.services.keyman.pull_policy = "never"' docker-compose.yml + yq eval -i '.services.keyman.restart = "no"' docker-compose.yml - yq eval -i '.services.rabbitmq.image = "ghcr.io/${{ github.repository_owner }}/augur_rabbitmq:test"' docker-compose.yml + yq eval -i '.services.rabbitmq.image = "ghcr.io/${{ github.repository_owner }}/collectoss-rabbitmq:test"' docker-compose.yml yq eval -i '.services.rabbitmq.pull_policy = "never"' docker-compose.yml yq eval -i '.services.rabbitmq.restart = "no"' docker-compose.yml @@ -297,9 +305,8 @@ jobs: # Always run this step to get logs, even if the previous step fails if: always() # We use tail so that we can see the name of each file as it's printed - run: "podman run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs tail -n +0'" - - + run: "podman run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs + tail -n +0'" push-image: name: Push image @@ -312,10 +319,14 @@ jobs: strategy: matrix: image: - - backend - - database - - keyman - - rabbitmq + - name: collectoss + folder: backend + - name: collectoss-database + folder: database + - name: collectoss-keyman + folder: keyman + - name: collectoss-rabbitmq + folder: rabbitmq runs-on: ubuntu-latest steps: - name: Checkout repository @@ -339,10 +350,10 @@ jobs: DOCKER_METADATA_ANNOTATIONS_LEVELS: index,manifest with: annotations: | - org.opencontainers.image.title=augur_${{ matrix.image}} + org.opencontainers.image.title=${{ matrix.image.name }} labels: | - org.opencontainers.image.title=augur_${{ matrix.image}} - images: ghcr.io/${{ github.repository_owner }}/augur_${{ matrix.image }} + org.opencontainers.image.title=${{ 
matrix.image.name }} + images: ghcr.io/${{ github.repository_owner }}/${{ matrix.image.name }} # Pushes to the main branch update the *:devel-latest tag # Releases update the *:latest tag and the *: tag tags: | @@ -356,12 +367,12 @@ jobs: with: annotations: ${{ steps.meta.outputs.annotations }} context: . - file: ./docker/${{ matrix.image }}/Dockerfile + file: ./docker/${{ matrix.image.folder }}/Dockerfile labels: ${{ steps.meta.outputs.labels }} platforms: linux/amd64 # Only push if we've tagged the image in the metadata step push: ${{ steps.meta.outputs.tags != '' }} tags: ${{ steps.meta.outputs.tags }} # Use the same cache as the build step - cache-from: type=gha,scope=container-${{ matrix.image }} - cache-to: type=gha,scope=container-${{ matrix.image }},mode=min + cache-from: type=gha,scope=container-${{ matrix.image.name }} + cache-to: type=gha,scope=container-${{ matrix.image.name }},mode=min diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 8b2606fbf..6350abf2f 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -27,7 +27,7 @@ jobs: reporter: github-pr-review level: warning glob_pattern: "**/*.py" - filter_mode: "file" + filter_mode: "diff_context" misspell: name: runner / misspell diff --git a/.gitignore b/.gitignore index 93be721ef..fe4575abd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ env.txt docker_env.txt pyenv.txt -augur_export_env.sh *DS_Store *.config.json !docker.config.json @@ -20,7 +19,6 @@ __pycache__/ *.pyc *.rdb yarn.lock -Augur_Muskellunge.egg-info/ runtime/ logs/ cache/ @@ -148,24 +146,10 @@ dmypy.json # Pyre type checker .pyre/ -# Augur-spdx -./spdx-scanner/dosocs2.conf -spdx-scanner/dosocs2.conf -spdx-scanner/augur-spdx/ -workers/spdx_worker/dosocs2.conf -workers/spdx_worker/augur-spdx/ -workers/spdx_worker/ex-raw.txt -workers/spdx_worker/ex.json - # Model files for machine learning -workers/message_insights_worker/message_models/*.h5 .h5 *.h5 -workers/clustering_worker/kmeans_repo_messages -workers/clustering_worker/lda_model -workers/clustering_worker/vocabulary -workers/clustering_worker/vocabulary_count - +*.model # compressed files *.gz @@ -187,10 +171,3 @@ nohup.out # local db volume pgdata/ postgres-data/ - -# Generated files from github -.history/sendgrid.env -sendgrid.env -*sendgrid*.env -./sendgrid.env -sendgrid.env diff --git a/.pylintrc b/.pylintrc index f18952423..be1b5632b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -51,7 +51,7 @@ ignore=CVS # Add files or directories matching the regex patterns to the ignore-list. The # regex matches against paths and can be in Posix or Windows format. -ignore-paths=./augur/application.db/models/,./augur/tasks/git/util/facade_worker/,./augur/application/schema/alembic/versions/,./augur/api/routes/,./augur/api/metrics/,./augur/api/config.py, ./augur/application/log_analysis/,./augur/augurface/ +ignore-paths=./collectoss/application.db/models/,./collectoss/tasks/git/util/facade_worker/,./collectoss/application/schema/alembic/versions/,./collectoss/api/routes/,./collectoss/api/metrics/,./collectoss/api/config.py, ./collectoss/application/log_analysis/ # Files or directories matching the regex patterns are skipped. The regex # matches against base names, not paths. The default value ignores Emacs file diff --git a/CITATION.cff b/CITATION.cff deleted file mode 100644 index 01514fb22..000000000 --- a/CITATION.cff +++ /dev/null @@ -1,14 +0,0 @@ -# This CITATION.cff reference content was generated from Zotero. 
-cff-version: 1.2.0 -message: "If you use this software, please cite it as below." -authors: - - family-names: Goggins - given-names: Sean - - family-names: Lumbard - given-names: Kevin - - family-names: Germonprez - given-names: Matt -title: "Open Source Community Health: Analytical Metrics and Their Corresponding Narratives" -doi: 10.1109/SoHeal52568.2021.00010 -date-released: 2021-01-01 -url: https://www.seangoggins.net/wp-content/plugins/zotpress/lib/request/request.dl.php?api_user_id=655145&dlkey=HNG22ZSU&content_type=application/pdf diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4df2a8dff..3d9182b26 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,131 +1,75 @@ # How to Contribute -We love to pull requests from everyone! We follow the standard Git workflow of `fork -> change -> pull request -> merge -> update fork -> change ... (repeat forever)`. - -If you are new to open source, we recommend GitHub's excellent guide on "[How to Contribute to Open Source](https://opensource.guide/how-to-contribute/)". In addition, please feel free to reach out to any of the maintainers or other community members if you are struggling as we are here to help you learn! - -Before getting started, please make sure you've read the [README](README.md) to get a primer on our project. Augur's documentation can be found [here](https://oss-augur.readthedocs.io/en/main/). ## Join the Community +We have a public Slack channel in the CHAOSS workspace, as well as public meetings. -We encourage all contributors to join the [CHAOSS Slack workspace](https://chaoss.community/kb-getting-started/) and participate in the `#wg-augur-8knot` channel. This is a great place to ask questions, get help with issues, participate in discussions, and stay updated on community meetings and planning. Don't hesitate to introduce yourself and ask for help if you get stuck! +We encourage all contributors to join the [CHAOSS Slack workspace](https://chaoss.community/kb-getting-started/) and participate in the `#wg-collectoss-8knot` channel. Our meeting times are kept up to date in the Software section of the [CHAOSS Calendar](https://chaoss.community/chaoss-calendar/). We recommend subscribing to the CHAOSS Software calendar so you can automatically stay up to date with any schedule or timezone changes. If you can't attend these meetings, they are also recorded and made available on the [CHAOSS YouTube](https://www.youtube.com/@CHAOSStube). -## Opening an issue -If you're experiencing an issue with Augur or have a question you'd like help answering, please feel free to open an [issue](https://github.com/chaoss/augur/issues). To help us prevent duplicates, we kindly ask that you briefly search for your problem or question in our [issues](https://github.com/chaoss/augur/issues) before opening a new one. +These resources are a great way to meet the people behind the project, ask questions, get help, participate in discussions, and stay updated on community meetings and planning. Everyone is welcome, so feel free to introduce yourself and ask for help if you get stuck or feel frustrated with any part of the contribution process! -Please note that if you open a bug report and your issue **does not** follow our template, we cannot help you until you have provided us all the relevant information in that format. -Respectfully, we do not have the time to try and recreate an error given with minimal or no context, so by providing this information you are helping us help you!
+## Learn about the project -### How to submit a bug report -To see the template referred to in the above section, click on **New Issue**, then click on the **Get Started** button on the **Bug Report** option. -A dialogue box populated with descriptions of what to put in each section, will pop up on a new page. -Kindly replace the descriptions with your comments to the best of your ability, and please include screenshots and error logs if applicable. +If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://collectoss.readthedocs.io/en/release/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. -file1 +## Opening an issue +If you're experiencing an issue with CollectOSS, you can search for your problem or question on our [issues](https://github.com/chaoss/collectoss/issues) page to see if someone else has already reported it. If you cannot find your issue, please feel free to [open a new one](https://github.com/chaoss/collectoss/issues/new/choose). -file2 +If you are new to opening issues, we recommend [opensource.guide](https://opensource.guide/how-to-contribute) and their section on [Opening Issues](https://opensource.guide/how-to-contribute/#opening-an-issue). -file3 +> [!TIP] +> Filling out our issue templates will help us gather all the necessary information to troubleshoot your issue efficiently. Issues that are missing details may take longer to be fixed. ## Contributing to the source code +We welcome pull requests from anyone! -1. Fork and clone this repo: -```bash -$ git clone github.com:your-username/augur.git -$ cd augur/ -$ git remote add upstream https://github.com/chaoss/augur.git -``` +We follow the same workflow that most other projects on GitHub follow: Fork -> create a branch -> make a pull request -> repeat. -2. Follow the [development installation instructions](https://github.com/chaoss/augur/blob/main/docs/new-install.md). - -3. Create a new branch -```bash -$ git checkout -b my-new-branch -``` - -4. Make your change(s). - -5. Commit the change(s) and push to your fork -```bash -$ git add . -$ git commit -s -m "This is my first commit" -$ git push -u origin my-new-branch -``` -6. Then, [submit a pull request](https://github.com/chaoss/augur/compare). +Detailed instructions for making your contribution under this workflow can be found on the [GitHub Flow page](https://docs.github.com/en/get-started/using-github/github-flow). There is also an opensource.guide section on [making pull requests](https://opensource.guide/how-to-contribute/#opening-a-pull-request). If you get stuck, please ask for help in the project Slack. -At this point, you're waiting on us. We like to at least comment on pull requests within three business days (and, typically, one business day). -Once one of our maintainers has had a chance to review your PR, we will either mark it as ```needs review``` and provide specific feedback on your changes, or we will go ahead and complete the pull request. +### Signing-off on Commits +To contribute to this project, you must agree to the [Developer Certificate of Origin](https://developercertificate.org/) (DCO) for each commit you make. The DCO is a simple statement that you, as a contributor, have the legal right to make the contribution. It is NOT a copyright assignment or transfer.
This certification is required for contributions to CHAOSS repositories by the [CHAOSS charter](https://chaoss.community/about/charter/#user-content-8-intellectual-property-policy). -## Signing-off on Commits -To contribute to this project, you must agree to the [Developer Certificate of Origin](https://developercertificate.org/) (DCO) by the [CHAOSS charter](https://chaoss.community/about/charter/#user-content-8-intellectual-property-policy) for each commit you make. The DCO is a simple statement that you, as a contributor, have the legal right to make the contribution. To signify that you agree to the DCO for contributions, you simply add a line to each of your git commit messages. For example: ``` Signed-off-by: Jane Smith ``` -This can be easily done by using the `-s` flag when running the `git commit` command, +This can be easily done by using the `-s` flag when running the `git commit` command: `git commit -s -m "my commit message"` -``` -$ git commit -s -m “my commit message w/signoff” -``` - -To ensure all your commits are signed, you may choose to [configure git](https://gist.github.com/xavierfoucrier/c156027fcc6ae23bcee1204199f177da) properly by editing your global ```.gitconfig``` - -**Any pull requests containing commits that are not signed off will not be eligible for merge until the commits have been signed off.** -## Keeping in sync with the Augur Repository +The Pi-hole project has a more detailed guide on adding this sign-off to your commits on their ["How to sign-off commits"](https://docs.pi-hole.net/guides/github/how-to-signoff/) page. -Remember to sync your fork with the ```main``` branch regularly, by taking the following steps: +> [!TIP] +> Signing off commits is slightly easier and safer if you do it before you push your changes to GitHub. -- Setup your upstream branch to point to the URL of the main Augur repo ```https://github.com/chaoss/augur.git```. - -- Next, in the root folder of the project, on the ```main``` branch, run: -``` -git remote add upstream https://github.com/chaoss/augur.git -``` -Whenever you need to make changes, make sure your ```main``` branch is in sync with the main repository, by checking out to the ```main``` branch and running: -``` -git pull upstream main -git push origin master -``` +### Keeping in sync with the CollectOSS Repository +As we merge code from maintainers and other contributors, the fork that you contribute from will likely start falling behind the `main` branch of CollectOSS. To make sure you are contributing on top of a new enough version of the code, keep your fork in sync regularly. -## Community Resources +GitHub has an article called [Syncing a fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork) that walks through several different ways to keep your fork up to date; a minimal command-line version is sketched below. -### Augur -- [Stable documentation (`release` branch)](https://oss-augur.readthedocs.io/en/release/) -- [Nightly/developer build documentation (`main` branch)](https://oss-augur.readthedocs.io/en/main/) (warning: this is should be considered an unstable branch and should not be used for production) -- [Live Augur demo](https://ai.chaoss.io) +> [!TIP] +> Making a new branch for each contribution will make it easier to keep your `main` branch in sync with the project.
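If you just want a quick command-line reference, a minimal sync looks something like this (a sketch, assuming your fork's remote is named `origin` and you add the main repository as a remote named `upstream`):

```bash
# One-time setup: point an "upstream" remote at the main repository
git remote add upstream https://github.com/chaoss/collectoss.git

# Update your local main branch from upstream, then push it to your fork
git checkout main
git fetch upstream
git merge upstream/main
git push origin main
```

The GitHub article linked above also covers doing the same thing entirely from the web UI.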
-### CHAOSS -- [Website](https://chaoss.community/) -- [Get Involved](https://chaoss.community/participate) -- [Join the CHAOSS Slack](https://chaoss.community/kb-getting-started/) - Join the `#wg-augur-8knot` channel to participate in discussions, meetings, and planning -- [Metrics](https://github.com/chaoss/metrics) -- [Evolution Metrics Working Group](https://github.com/chaoss/wg-evolution) -- [Common Metrics Working Group](https://github.com/chaoss/wg-common) -- [Risk Metrics Working Group](https://github.com/chaoss/wg-risk) -- [Value Metrics Working Group](https://github.com/chaoss/wg-value) -- [Diversity & Inclusion Metrics Working Group](https://github.com/chaoss/wg-diversity-inclusion) +## Helpful Links -## Technical Resources +- [CollectOSS stable documentation](https://collectoss.readthedocs.io/en/release/) +- [CHAOSS Getting Started page](https://chaoss.community/kb-getting-started/) -### Git & GitHub +**Git & GitHub** - [How to contribute to Open Source](https://opensource.guide/how-to-contribute/) - [GitHub's Git Handbook](https://guides.github.com/introduction/git-handbook/) - [GitHub's "Hello World" tutorial](https://guides.github.com/activities/hello-world/) - [Understanding the GitHub Flow](https://guides.github.com/introduction/flow/) -- [Commit message style guidelines](https://commit.style/) - [No-nonsense Git reference](https://rogerdudler.github.io/git-guide/) (best to have a cursory understanding of Git before hand) -### Python guides +**Python guides** - [Python's official tutorial](https://docs.python.org/3/tutorial/index.html) - [Python's official style guide](https://www.python.org/dev/peps/pep-0008/) - [Python best practices](https://gist.github.com/sloria/7001839) - [The Zen of Python](https://www.python.org/dev/peps/pep-0020/) -### PostgreSQL guides +**PostgreSQL guides** - [PostgreSQL installation guide](https://www.postgresql.org/docs/12/tutorial-install.html) - [PostgreSQL official tutorial](https://www.postgresql.org/docs/) - [PostgreSQL docker official image](https://hub.docker.com/_/postgres) diff --git a/CONTRIBUTORS.md b/CREDITS.md similarity index 66% rename from CONTRIBUTORS.md rename to CREDITS.md index 8599944b0..238478d63 100644 --- a/CONTRIBUTORS.md +++ b/CREDITS.md @@ -1,88 +1,79 @@ -# Contributors & Participants - -This file contains full attribution lists for: -- Current maintainers -- Founding Maintainers -- Former maintainers -- Contributors -- Google Summer of Code participants (by year) - ---- -## Current Maintainers -- Sean P. 
Goggins — [@sgoggins](https://github.com/sgoggins) -- Adrian Edwards — [@MoralCode](https://github.com/MoralCode) -- Andrew Brain — [@ABrain7710](https://github.com/ABrain7710) -- Isaac Milarsky — [@IsaacMilarky](https://github.com/IsaacMilarky) -- John McGinness — [@Ulincys](https://github.com/Ulincsys) - --- - -## Founding Maintainers -- Derek Howard — [@howderek](https://github.com/howderek) - -## Former Maintainers -- Carter Landis — [@ccarterlandis](https://github.com/ccarterlandis) -- Gabe Heim — [@gabe-heim](https://github.com/gabe-heim) -- Matt Snell — [@Nebrethar](https://github.com/Nebrethar) -- Christian Cmehil-Warn — [@christiancme](https://github.com/christiancme) -- Jonah Zukosky — [@jonahz5222](https://github.com/jonahz5222) -- Carolyn Perniciaro — [@CMPerniciaro](https://github.com/CMPerniciaro) -- Elita Nelson — [@ElitaNelson](https://github.com/ElitaNelson) -- Michael Woodruff — [@michaelwoodruffdev](https://github.com/michaelwoodruffdev/) -- Max Balk — [@maxbalk](https://github.com/maxbalk/) - --- - -## Contributors -- [Dawn Foster](https://github.com/geekygirldawn) -- [Ivana Atanasova](https://github.com/ivanayov) -- [Georg J.P. Link](https://github.com/GeorgLink) -- [Gary P. White](https://github.com/garypwhite) - --- - -## GSoC 2025 Participants -- [Akshat Baranwal](https://github.com/akshatb2006) -- [Asish Kumar](https://github.com/officialasishkumar) -- [Jiahong Lin](https://github.com/xiaoha-cloud) - --- - -## GSoC 2022 Participants -- [Kaxada](https://github.com/kaxada) -- [Mabel F](https://github.com/mabelbot) -- [Priya Srivastava](https://github.com/Priya730) -- [Ramya Kappagantu](https://github.com/RamyaKappagantu) -- [Yash Prakash](https://gist.github.com/yash-yp) - --- - -## GSoC 2021 Participants -- [Dhruv Sachdev](https://github.com/Dhruv-Sachdev1313) -- [Rashmi K A](https://github.com/Rashmi-K-A) -- [Yash Prakash](https://gist.github.com/yash-yp) -- [Anuj Lamoria](https://github.com/anujlamoria) -- [Yeming Gu](https://github.com/gymgym1212) -- [Ritik Malik](https://gist.github.com/ritik-malik) - --- - -## GSoC 2020 Participants -- [Akshara P](https://github.com/aksh555) -- [Tianyi Zhou](https://github.com/tianyichow) -- [Pratik Mishra](https://github.com/pratikmishra356) -- [Sarit Adhikari](https://github.com/sarit-adh) -- [Saicharan Reddy](https://github.com/mrsaicharan1) -- [Abhinav Bajpai](https://github.com/abhinavbajpai2012) - --- - -## GSoC 2019 Participants -- [Bingwen Ma](https://github.com/bing0n3) -- [Parth Sharma](https://github.com/parthsharma2) - --- - -## GSoC 2018 Participants -- [Keanu Nichols](https://github.com/kmn5409) - +# Credits + +This file aims to provide credit to anyone who has contributed to the CollectOSS project, including financial supporters and contributors to the Augur project, upon which CollectOSS is based. + +## CollectOSS Contributors + +### Maintainers +The list of current CollectOSS maintainers can be found in the [MAINTAINERS](./MAINTAINERS.md) file. + + + +## Augur Contributors + +Augur has been supported by the University of Missouri through funding provided by the Alfred P. Sloan Foundation, Mozilla, The Reynolds Journalism Institute, with contributions from VMware, Red Hat LLC, Grace Hopper's Open Source Day, GitHub, Microsoft, Twitter, Adobe, the Gluster Project, Open Source Summit (NA/Europe), and the Linux Foundation Compliance Summit.
+ +Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien, Remy Decausemaker, Google Summer of Code students, and others, including: + +### Maintainers +- Sean P. Goggins — [@sgoggins](https://github.com/sgoggins) +- Adrian Edwards — [@MoralCode](https://github.com/MoralCode) +- Andrew Brain — [@ABrain7710](https://github.com/ABrain7710) +- Isaac Milarsky — [@IsaacMilarky](https://github.com/IsaacMilarky) +- John McGinness — [@Ulincsys](https://github.com/Ulincsys) + +### Founding Maintainers +- Derek Howard — [@howderek](https://github.com/howderek) + +### Former Maintainers +- Carter Landis — [@ccarterlandis](https://github.com/ccarterlandis) +- Gabe Heim — [@gabe-heim](https://github.com/gabe-heim) +- Matt Snell — [@Nebrethar](https://github.com/Nebrethar) +- Christian Cmehil-Warn — [@christiancme](https://github.com/christiancme) +- Jonah Zukosky — [@jonahz5222](https://github.com/jonahz5222) +- Carolyn Perniciaro — [@CMPerniciaro](https://github.com/CMPerniciaro) +- Elita Nelson — [@ElitaNelson](https://github.com/ElitaNelson) +- Michael Woodruff — [@michaelwoodruffdev](https://github.com/michaelwoodruffdev/) +- Max Balk — [@maxbalk](https://github.com/maxbalk/) + +### Contributors +- [Dawn Foster](https://github.com/geekygirldawn) +- [Ivana Atanasova](https://github.com/ivanayov) +- [Georg J.P. Link](https://github.com/GeorgLink) +- [Gary P. White](https://github.com/garypwhite) +- [Shlok Gilda](https://github.com/shlokgilda) + +### GSoC 2025 Participants +- [Akshat Baranwal](https://github.com/akshatb2006) +- [Asish Kumar](https://github.com/officialasishkumar) +- [Jiahong Lin](https://github.com/xiaoha-cloud) + +### GSoC 2022 Participants +- [Kaxada](https://github.com/kaxada) +- [Mabel F](https://github.com/mabelbot) +- [Priya Srivastava](https://github.com/Priya730) +- [Ramya Kappagantu](https://github.com/RamyaKappagantu) +- [Yash Prakash](https://gist.github.com/yash-yp) + +### GSoC 2021 Participants +- [Dhruv Sachdev](https://github.com/Dhruv-Sachdev1313) +- [Rashmi K A](https://github.com/Rashmi-K-A) +- [Yash Prakash](https://gist.github.com/yash-yp) +- [Anuj Lamoria](https://github.com/anujlamoria) +- [Yeming Gu](https://github.com/gymgym1212) +- [Ritik Malik](https://gist.github.com/ritik-malik) + +### GSoC 2020 Participants +- [Akshara P](https://github.com/aksh555) +- [Tianyi Zhou](https://github.com/tianyichow) +- [Pratik Mishra](https://github.com/pratikmishra356) +- [Sarit Adhikari](https://github.com/sarit-adh) +- [Saicharan Reddy](https://github.com/mrsaicharan1) +- [Abhinav Bajpai](https://github.com/abhinavbajpai2012) + +### GSoC 2019 Participants +- [Bingwen Ma](https://github.com/bing0n3) +- [Parth Sharma](https://github.com/parthsharma2) + +### GSoC 2018 Participants +- [Keanu Nichols](https://github.com/kmn5409) diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 000000000..bdd6f2560 --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,151 @@ +# CollectOSS Project Governance + +CollectOSS is dedicated to building and improving a data collection, transformation, and serving backend for open source contributor activity. CollectOSS will operate as a project within the CHAOSS Organization, which is a member of the Linux Foundation. This governance explains how the project is run.
+ +- [Values](#values) +- [Maintainers](#maintainers) +- [Becoming a Maintainer](#becoming-a-maintainer) +- [Meetings](#meetings) +- [CHAOSS Resources](#chaoss-resources) +- [Code of Conduct Enforcement](#code-of-conduct-committee) +- [Security Response Team](#security-response-team) +- [Voting](#voting) +- [Modifications](#modifying-this-charter) + +## Values + +The CollectOSS project and its leadership embrace the following values: + +* Openness: Communication and decision-making happen in the open and are discoverable for future + reference. As much as possible, all discussions and work take place in public + forums and open repositories. + +* Fairness: All stakeholders have the opportunity to provide feedback and submit + contributions, which will be considered on their merits. + +* Inclusivity: We innovate through different perspectives and skill sets, which + can only be accomplished in a welcoming and respectful environment. + +* Participation: Responsibilities within the project are earned through + participation, and there is a clear path up the contributor ladder into leadership + positions. + +## Maintainers + +The current maintainers can be found in [MAINTAINERS.md](./MAINTAINERS.md). Maintainers collectively manage the project's resources and contributors. + +CollectOSS Maintainers have merge approval rights to the [project GitHub repository](https://github.com/chaoss/collectoss) and all other CollectOSS project repositories. + +This privilege is granted with some expectation of responsibility: maintainers +are people who care about the CollectOSS project and want to help it grow and +improve. A maintainer is not just someone who can make changes, but someone who +has demonstrated their ability to collaborate with the team, get the most +knowledgeable people to review code and docs, contribute high-quality code, and +follow through to fix issues (in code or tests). + +A maintainer is a contributor to the project's success and a citizen helping +the project succeed. + +The collective team of all Maintainers is known as the Maintainer Council, which +is the governing body for the project. + +### Becoming a Maintainer + +To become a Maintainer, you need to demonstrate the following: + + * commitment to the project: + * participate in discussions, contributions, code and documentation reviews + for 6 months or more, + * perform reviews for at least 4 non-trivial pull requests, + * contribute at least 3 non-trivial pull requests and have them merged, + * ability to write quality code and/or documentation, + * ability to collaborate with the team, + * understanding of how the team works (policies, processes for testing and code review, etc.), + * understanding of the project's code base and coding and documentation style, + * dedication to maintaining CollectOSS as a shared project for the CHAOSS community. + +A new Maintainer must be proposed by an existing Maintainer by posting an issue in the project repository. A simple majority vote of existing Maintainers approves the application. Maintainer nominations will be evaluated without prejudice to employer or demographics. + +Maintainers who are selected will be granted the necessary GitHub rights, +and invited to the private maintainer Slack channel. + +### Removing a Maintainer + +Maintainers may resign at any time if they feel that they will not be able to
+ +Maintainers may also be removed after being inactive, failure to fulfill their +Maintainer responsibilities, violating the Code of Conduct, or other reasons. +Inactivity is defined as a period of very low or no activity in the project +for a year or more, with no definite schedule to return to full Maintainer +activity. + +A Maintainer may be removed at any time by a 2/3 vote of the remaining maintainers. + +Depending on the reason for removal, a Maintainer may be converted to Emeritus +status. Emeritus Maintainers will still be consulted on some project matters, +and can be rapidly returned to Maintainer status if their availability changes. + +## Meetings + +Time zones permitting, Maintainers are expected to participate in the public +developer meeting, which occurs every two weeks according to the CHAOSS calendar. + +Maintainers will also have closed meetings in order to discuss security reports +or reports from the CHAOSS Code of Conduct Committee. Such meetings should be scheduled by any Maintainer on receipt of a security issue or CoCC message. All current Maintainers must be invited to such closed meetings, except for any Maintainer who is accused of a CoC violation. + +## CHAOSS Resources + +Any Maintainer may suggest a request for CHAOSS resources, either in an issue, or during a meeting. A simple majority of Maintainers approves the request. The Maintainers may also choose to delegate working with CHAOSS to non-Maintainer community members, who will then be added to the [MAINTAINERS.md file](./MAINTAINERS.md) with that special status. + +## Code of Conduct Committee + +The CollectOSS project adheres to the [CHAOSS Code of Conduct](https://chaoss.community/code-of-conduct/)(CoC). As such, community members needing to report a violation of the CoC should report it directly to the CHAOSS Code Of Conduct Committee (CoCC). + +The Maintainers will work with the CoCC on any reports which require action by the project. + +## Security Response Team + +The Maintainers will appoint a Security Response Team to handle security reports. +This committee may simply consist of the Maintainer Council themselves. If this +responsibility is delegated, the Maintainers will appoint a team of at least two +contributors to handle it. The Maintainers will review who is assigned to this +at least once a year. + +The Security Response Team is responsible for handling all reports of security +holes and breaches according to the [security policy](./SECURITY.md). + +## Voting + +While most business in CollectOSS is conducted by "[lazy consensus](https://community.apache.org/committers/lazyConsensus.html)", +periodically the Maintainers may need to vote on specific actions or changes. +A vote can be taken on the project's public Slack channel (#wg-collectoss-8knot in the [CHAOSS Slack](https://chaoss.community/kb-getting-started/)) or +the private Maintainer Slack channel for security or conduct matters. +Votes may also be taken at the biweely developer meeting. Any Maintainer may +demand a vote be taken. + +Most votes require a simple majority of all Maintainers to succeed, except where +otherwise noted. Two-thirds majority votes mean at least two-thirds of all +existing maintainers. + +## Transitional Period + +There will be a Transitional Period for six to eight months after the CollectOSS project is launched. During that transitional period, the project governance will be modified in the following ways in order to build a new Maintainer Council. 
+ +The project will be governed by the [Transitional Maintainers](./MAINTAINERS.md), who may or may not meet the standard qualifications for a Maintainer. +* The Transitional Maintainers will be seeking to appoint new Maintainers based on an optimistic and flexible evaluation of their contributions during the first months of the project. This will generally include "crediting" contributors for contributions made to the Augur project. +* Newly appointed Maintainers do not need to meet the full qualifications for Maintainer above (particularly the 6-month requirement), and will be approved by a fast-track process. + +The following will happen at the end of the Transitional Period: + +1. The Maintainers to date will vote to end the Transition. +2. The Maintainers will update the Maintainer requirements based on the early project experience. +3. Any Transitional Maintainers who do not qualify as, or do not wish to be, long-term Maintainers will step down, and the remaining ones will be converted to long-term Maintainers. +4. This section will then be removed from the Governance. + +## Modifying this Charter + +Changes to this Governance and its supporting documents may be approved by +a 2/3 vote of the Maintainers. + +This governance document was created based on the template available at https://github.com/cncf/project-template/blob/main/GOVERNANCE-maintainer.md diff --git a/LICENSE b/LICENSE index 2f389d8d3..90c664b94 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 Matt Germonprez, Sean Goggins, Gabe Heim, Derek Howard, Carter Landis, Matt Snell, Brian Warner, University of Nebraska at Omaha, and the University of Missouri +Copyright (c) CHAOSS and the collective contributors to the CollectOSS and Augur projects. A full listing of all contributors may be found at https://github.com/chaoss/collectoss/blob/main/CREDITS.md Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 000000000..7a68ba12d --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,20 @@ +## CollectOSS Project Maintainers + +The current Maintainers for the CollectOSS Project consist of: + +| Name | Employer | Responsibilities | +| ----------- | ------- | -------------------------------- | +| [Adrian Edwards](https://github.com/MoralCode) | Red Hat | Transitional Maintainer | +| [Shlok Gilda](https://github.com/shlokgilda) | University of Florida | Transitional Maintainer | +| [Cali Dolfi](https://github.com/cdolfi) | Red Hat | Transitional Maintainer | +| [Josh Berkus](https://github.com/jberkus) | Red Hat | Transitional Maintainer | + +See [the project Governance](GOVERNANCE.md) for how maintainers are selected and replaced, and the rules around the Transitional Maintainer status. + +## Emeritus Maintainers + +The following people have helped by leading and contributing to the CollectOSS project, but have reduced their level of involvement, or stepped aside so that others may step up in their place.
+ +| Name | Date of Retirement | +| ---- | ------------------ | +| [Name Here](https://github.com/ghhandle) | Date Here | diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100755 index b0d6b1ab4..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE -include metadata.py -include schema/ -include scripts/ diff --git a/Makefile b/Makefile index c00d789fa..18ea51a7f 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,6 @@ #SPDX-License-Identifier: MIT default: - @ echo "Installation Commands:" - @ echo " install Installs Augur's full stack for production" - @ echo " wizard Install Augur and launch the graphical setup wizard" - @ echo " clean Removes potentially troublesome compiled files" - @ echo " rebuild Removes build/compiled files & binaries and reinstalls the project" - @ echo - @ echo "Development Commands:" - @ echo " db Initialize a fresh Docker database container (restarts it if it's still running)" - @ echo " dev Starts the full stack in the background" - @ echo " dev-start Runs the backend and frontend servers in the background" - @ echo " dev-stop Stops the backgrounded backend & frontend server commands" - @ echo @ echo "Testing Commands:" @ echo " test-data Start the testing dataset Docker database" @ echo " test Runs all tests" @@ -22,69 +10,17 @@ default: @ echo " docs Generates the documentation" @ echo " docs-view Generates the documentation, then opens it for local viewing" -# @ echo " test-application Runs all application unit tests (including metrics)" -# @ echo " test-workers Run all worker unit tests" - - -# -# Installation -# -.PHONY: install -.PHONY: install-spdx install-spdx-sudo install-augur-sbom -.PHONY: clean rebuild -install: uv - @ uv run ./scripts/install/install.sh dev - -wizard: - @ ./scripts/install/install.sh graphical - -install-spdx: - @ ./scripts/install/install-spdx.sh - -install-spdx-sudo: - @ ./scripts/install/install-spdx-sudo.sh - -install-augur-sbom: - @ ./scripts/install/nomos.sh - -clean: - @ scripts/control/clean.sh - -rebuild: uv - @ uv run scripts/control/rebuild.sh dev - -# -# Development -# -.PHONY: dev-start dev-stop dev monitor-frontend monitor-backend monitor frontend backend-stop backend-start backend-restart backend clean rebuild - -dev-start: - @ scripts/control/start_augur.sh - @ scripts/control/start_frontend.sh - -dev-stop: - @ augur backend stop - @ scripts/control/kill_frontend.sh - -dev: dev-stop dev-start - -db: - @ - docker stop augur_database - @ - docker rm augur_database - @ docker run -p 5434:5432 --name augur_database augurlabs/augur:database - - lint: - @ pylint augur + @ pylint collectoss lint-count: - @ pylint augur | wc -l + @ pylint collectoss | wc -l lint-docs: - @ pylint augur | grep docstring + @ pylint collectoss | grep docstring lint-docs-missing: - @ pylint augur | grep docstring | wc -l + @ pylint collectoss | grep docstring | wc -l lint-github-tasks-count: - @ pylint augur | grep augur/tasks/github/ | wc -l + @ pylint collectoss | grep collectoss/tasks/github/ | wc -l # # Testing @@ -92,9 +28,9 @@ lint-github-tasks-count: .PHONY: test test-data test-application test-metric-routes test-python-versions test-data: - @ - docker stop augur_test_data - @ - docker rm augur_test_data - @ docker run -p 5434:5432 --name augur_test_data augurlabs/augur:test_data@sha256:71da12114bf28584a9a64ede2fac0cbc8dffc8e2f4a2c61231206e2f82201c2f + @ - docker stop test_data + @ - docker rm test_data + @ docker run -p 5434:5432 --name test_data 
augurlabs/augur:test_data@sha256:71da12114bf28584a9a64ede2fac0cbc8dffc8e2f4a2c61231206e2f82201c2f test: # @ pytest tests/test_tasks/test_github_tasks/ @@ -109,19 +45,7 @@ test-api: -# test-application: -# @ bash -c 'tox -e py-application' - -#Worker's tests need a database from docker -#To use the docker daemon you need to be root so sudo is needed. -# test-workers: -# @ bash -c 'sudo tox -e py-workers' - -# test-metric-routes: -# @ bash -c 'tox -e py-metric-routes' -# test-python-versions: -# @ bash -c 'tox -e ALL' # @@ -141,55 +65,3 @@ docs: uv docs-view: docs @ bash -c 'open docs/build/html/index.html' - - -# -# Docker Shortcuts -# Do not use these unless you know what they mean. -.PHONY: compose-run compose-run-database -.PHONY: build-backend run-backend build-frontend run-frontend build-database run-database - - -compose-run: - @ docker compose -f docker-compose.yml up --build - -compose-run-database: - @ echo "**************************************************************************" - @ echo "Make sure there are no database credentials in docker_env.txt!" - @ echo "**************************************************************************" - @ echo - @ docker compose -f docker-compose.yml -f database-compose.yml up --build - -docker-build: docker-build-backend docker-build-frontend docker-build-database docker-build-rabbitmq - -docker-build-backend: - @ docker build -t augurlabs/augur:backend -f util/docker/backend/Dockerfile . - -docker-build-frontend: - @ docker build -t augurlabs/augur:frontend -f util/docker/frontend/Dockerfile . - -docker-build-database: - @ docker build -t augurlabs/augur:database -f util/docker/database/Dockerfile . - -docker-build-rabbitmq: - @ docker build -t augurlabs/augur:rabbitmq -f util/docker/rabbitmq/Dockerfile . - -docker-run-backend: - @ - docker stop augur_backend - @ - docker rm augur_backend - docker run -p 5000:5000 --name augur_backend --env-file docker_env.txt augurlabs/augur:backend - -docker-run-frontend: - @ - docker stop augur_frontend - @ - docker rm augur_frontend - docker run -p 8080:8080 --name augur_frontend augurlabs/augur:frontend - -docker-run-database: - @ - docker stop augur_database - @ - docker rm augur_database - docker run -p 5434:5432 --name augur_database augurlabs/augur:database - -docker-run-rabbitmq: - @ - docker stop augur_rabbitmq - @ - docker rm augur_rabbitmq - docker run -p 5434:5432 --name augur_rabbitmq augurlabs/augur:rabbitmq diff --git a/README.md b/README.md index 0a0ab7dea..ee4782dce 100644 --- a/README.md +++ b/README.md @@ -1,95 +1,55 @@ -# Augur NEW Release v0.92.0 +# CollectOSS -Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else! -The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io). 
+[![Build Docker images](https://github.com/chaoss/collectoss/actions/workflows/build_docker.yml/badge.svg)](https://github.com/chaoss/collectoss/actions/workflows/build_docker.yml) [![Hits-of-Code](https://hitsofcode.com/github/chaoss/collectoss?branch=release)](https://hitsofcode.com/github/chaoss/collectoss/view?branch=release) -[![first-timers-only](https://img.shields.io/badge/first--timers--only-friendly-blue.svg?style=flat-square)](https://www.firsttimersonly.com/) -We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy of tagging issues for first timers only, and walking one newcomer through the resolution process weekly. You can find these issues tagged with [first timers only](https://github.com/chaoss/augur/labels/first-timers-only) on our issues list. +## What is CollectOSS? +CollectOSS is a software suite for collecting structured data +about [free](https://www.fsf.org/about/) and [open-source](https://opensource.org/docs/osd) software (FOSS) communities via git forges. -[![standard-readme compliant](https://img.shields.io/badge/standard--readme-OK-green.svg?style=flat-square)](https://github.com/RichardLitt/standard-readme) [![Build Docker images](https://github.com/chaoss/augur/actions/workflows/build_docker.yml/badge.svg)](https://github.com/chaoss/augur/actions/workflows/build_docker.yml) [![Hits-of-Code](https://hitsofcode.com/github/chaoss/augur?branch=release)](https://hitsofcode.com/github/chaoss/augur/view?branch=release) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2788/badge)](https://bestpractices.coreinfrastructure.org/projects/2788) +CollectOSS's main focus is to measure the overall health and sustainability of open source projects, as these types of projects are system critical for nearly every software organization or company. -## NEW RELEASE ALERT! -**If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. +The data CollectOSS collects covers more than just code contributions and extends to anything that can be derived from forge data, including comments, change reviews, releases, and other project activity or interactions. This data is stored in a relational database (PostgreSQL), enabling large-scale data aggregation across any number of repositories to provide context about the way these communities evolve. -Augur is now releasing a dramatically improved new version. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.92.0). +CollectOSS is part of [CHAOSS](https://chaoss.community), which is a Linux Foundation® project. Many of our metrics are implementations of the [metrics](https://chaoss.community/metrics/) defined by the CHAOSS community. +## Versions and support +CollectOSS is a Python project distributed via container images and aims to support all currently-supported versions of Python on macOS and Linux platforms. Docker is the primary supported container runtime, but Podman is also supported and used by some maintainers, although it requires configuring some extra permissions to run correctly. -- The `release` branch is a stable version of our new architecture, which features: - - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. - - A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard. 
- - Materialized views to increase the snappiness of API’s and Frontends on large scale data. - - Changes to primary keys, which now employ a UUID strategy that ensures unique keys across all Augur instances. - - Support for [8knot](https://github.com/oss-aspen/8knot) dashboards (view a sample [here](https://eightknot.osci.io/)). - *beautification coming soon!* - - Data collection completeness assurance enabled by a structured, relational data set that is easily compared with platform API Endpoints. -- The next release of the new version will include a hosted version of Augur where anyone can create an account and add repos *they care about*. -If the hosted instance already has a requested organization or repository it will be added to a user’s view. If its a new repository or organization, the user will be notified that collection will take (time required for the scale of repositories added). +Our `main` branch is our development branch that all pull requests should be based on. The `release` branch is where we merge and tag new versions and is the branch we recommend using in production. You can see tagged versions and corresponding release notes on the [releases page](https://github.com/chaoss/collectoss/releases). -## What is Augur? -Augur is a software suite for collecting and measuring structured data -about [free](https://www.fsf.org/about/) and [open-source](https://opensource.org/docs/osd) software (FOSS) communities. +## Installation +Basic initial setup can be completed in a few minutes as follows: -We gather trace data for a group of repositories, normalize it into our data model, and provide a variety of metrics about said data. The structure of our data model enables us to synthesize data across various platforms to provide meaningful context for meaningful questions about the way these communities evolve. +1. Clone the repository - `git clone https://github.com/chaoss/collectoss` +2. (optional) if you want to build the development version, run `docker compose build` +3. Copy the `environment.txt` file to a new file called `.env` and fill in values for the required variables +4. Run `docker compose up` to start the containers -Augur’s main focus is to measure the overall health and sustainability of open source projects, as these types of projects are system critical for nearly every software organization or company. We do this by gathering data about project repositories and normalizing that into our data model to provide useful metrics about your project’s health. +Check out the [CollectOSS Documentation](https://collectoss.readthedocs.io) for more detailed setup instructions and troubleshooting steps. -For example, one of our metrics is *burstiness*. Burstiness – how are short timeframes of intense activity, followed by a corresponding return to a typical pattern of activity, observed in a project? -This can paint a picture of a project’s focus and gain insight into the potential stability of a project and how its typical cycle of updates occurs. - -We are a [CHAOSS](https://chaoss.community) project, and many of our -metrics are implementations of the metrics defined by our awesome community. You can find a full list of them [here](https://chaoss.community/metrics/). - -For more information on [how to get involved on the CHAOSS website](https://chaoss.community/participate/). - -## Collecting Data - -Augur supports ```Python3.7``` through ```Python3.11``` on all platforms. ```Python3.12``` and above do not yet work because of machine learning worker dependencies. 
On OSX, you can create a ```Python3.11``` environment, by running: -``` -$ python3.11 -m venv path/to/venv -``` - -Augur's main focus is to measure the overall health and sustainability of open source projects. - -Augur collects more data about open source software projects than any other available software. Augur's main focus is to measure the overall health and sustainability of open source projects. - -One of Augur's core tenets is a desire to openly gather data that people can trust, and then provide useful and well-defined metrics that help give important context to the larger stories being told by that data. - -We do this in a variety of ways, one of which is doing all our own data collection in house. We currently collect data from a few main sources: - -1. Raw Git commit logs (commits, contributors) -2. GitHub's API (issues, pull requests, contributors, releases, repository metadata) -3. The Linux Foundation's [Core Infrastructure Initiative](https://www.coreinfrastructure.org/) API (repository metadata) -4. [Succinct Code Counter](https://github.com/boyter/scc), a blazingly fast Sloc, Cloc, and Code tool that also performs COCOMO calculations - -This data is collected by dedicated data collection workers controlled by Augur, each of which is responsible for querying some subset of these data sources. -We are also hard at work building workers for new data sources. If you have an idea for a new one, [please tell us](https://github.com/chaoss/augur/issues/new?template=feature_request.md) - we'd love your input! - - -## Getting Started +## Contributing +We strongly believe that communities are what make open source so impactful. We invite you to join our community, regardless of your experience level or coding abilities! -If you're interested in collecting data with our tool, the Augur team has worked hard to develop a detailed guide to get started with our project which can be found [in our documentation](https://oss-augur.readthedocs.io/en/main/getting-started/toc.html). +Check out the [CHAOSS Getting Started guide](https://chaoss.community/kb-getting-started/) to join Slack and learn more about CHAOSS. After you arrive, we recommend: +- Joining the **#wg-collectoss-8knot** channel (or asking for help finding it) +- Subscribing to the CHAOSS Software meetings in your calendar using the links on the [CHAOSS Calendar](https://chaoss.community/chaoss-calendar/) page -If you're looking to contribute to Augur's code, you can find installation instructions, development guides, architecture references (coming soon), best practices and more in our [developer documentation](https://oss-augur.readthedocs.io/en/main/development-guide/toc.html). +Information about contribution guidelines, building from source, and testing can be found in our [CONTRIBUTING.md](CONTRIBUTING.md). -Please know that while it's still rather sparse right now, -but we are actively adding to it all the time. +## Who uses CollectOSS? -If you get stuck, please feel free to [ask for help](https://github.com/chaoss/augur/issues/new)! +CollectOSS metrics are used by many other visualization and metrics projects, such as: -## Contributing +- [8Knot](https://github.com/oss-aspen/8Knot) -To contribute to Augur, please follow the guidelines found in our [CONTRIBUTING.md](CONTRIBUTING.md) and the CHAOSS [Code of Conduct]([CODE_OF_CONDUCT.md](https://github.com/chaoss/.github/blob/main/CODE_OF_CONDUCT.md)). 
Augur is a welcoming community that is open to all, regardless if you're working on your 1000th contribution to open source or your 1st. -We strongly believe that much of what makes open source so great is the incredible communities it brings together, so we invite you to join us! +*If you would like your project or organization listed here, please file a Pull Request!* ## License, Copyright, and Funding -Copyright © 2025 University of Missouri, Sean Goggins, and Derek Howard. +CollectOSS is free software: you can redistribute it and/or modify it under the terms of the MIT License as published by the Open Source Initiative. See the [LICENSE](LICENSE) file for more details. -Augur is free software: you can redistribute it and/or modify it under the terms of the MIT License as published by the Open Source Initiative. See the [LICENSE](LICENSE) file for more details. -This work has been funded through the Alfred P. Sloan Foundation, Mozilla, The Reynolds Journalism Institute, contributions from VMWare, Red Hat Software, Grace Hopper's Open Source Day, GitHub, Microsoft, Twitter, Adobe, the Gluster Project, Open Source Summit (NA/Europe), and the Linux Foundation Compliance Summit. -Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien, Remy Decausemaker, others omitted due to the memory limitations of project maintainers, and 15 Google Summer of Code Students. -## Maintainers & Contributors +## Credits -Refer to [CONTRIBUTORS.md](./CONTRIBUTORS.md) for detailed information about project maintainers, contributors, and GSoC participants. +Refer to [CREDITS.md](./CREDITS.md) for detailed information about the people and funding that have helped make this project possible. diff --git a/SECURITY.md b/SECURITY.md index 3346259bc..df9f440c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,38 +2,29 @@ ## Supported Versions -These versions of Augur are currently supported with security updates. - -| Version | Supported | -| ------- | ------------------ | -| 0.50.3 | :white_check_mark: | -| 0.50.2 | :white_check_mark: | -| 0.50.1 | :white_check_mark: | -| 0.50.0 | :white_check_mark: | -| 0.44.5 | :white_check_mark: | -| 0.44.3 | :white_check_mark: | -| 0.44.2 | :white_check_mark: | -| 0.44.1 | :white_check_mark: | -| 0.44.0 | :white_check_mark: | -| 0.27.x | :x: | -| 0.26.x | :x: | -| 0.25.x | :x: | -| 0.24.x | :x: | -| 0.23.x | :x: | -| 0.21.x | :x: | -| 0.20.x | :x: | -| 0.19.x | :x: | -| 0.18.x | :x: | -| 0.17.x | :x: | -| 0.16.x | :x: | -| 0.15.x | :x: | -| 0.14.x | :x: | -| 0.13.x | :x: | -| 0.12.x | :x: | -| 0.11.x | :x: | -| 0.10.x | :x: | -| < 0.10 | :x: | +We currently provide security updates for the latest tagged release of CollectOSS. + +Older versions are not actively supported. In exceptional circumstances, maintainers may choose to backport fixes on a case-by-case basis. ## Reporting a Vulnerability -Please report vulnerabilities using GitHub Issues +**IMPORTANT: Do not report security vulnerabilities using public GitHub Issues or public discussions.** + +### Private Disclosure Process + +If you discover a security vulnerability in CollectOSS, please report it privately by opening a [New Private Vulnerability Report](https://github.com/chaoss/collectoss/security/advisories/new). Please fill out the provided advisory template to ensure we have all the details (Impact, Reproduction Steps, and Affected Versions) needed to investigate. 
+ +### Responsible Disclosure Guidelines + +We do our best to follow responsible disclosure practices: + +- **Credit**: We will acknowledge your discovery in security release notes (unless you prefer anonymity) +- **Coordination**: We will work with you to coordinate the disclosure and release timeline based on our capacity to resolve the issue +- **Communication**: We aim to remain communicative and keep your ticket updated with the status so you know what to expect +- **No Public Issues**: Please avoid creating public GitHub issues, pull requests, branches, or forks for developing fixes to security vulnerabilities unless told otherwise. All of these can leak the existence of the vulnerability before it is fully fixed. + +Thank you for helping keep CollectOSS secure! + +## Security Response Committee + +The current Security Response Committee consists of the [CollectOSS Maintainers](./MAINTAINERS.md). diff --git a/alembic.ini b/alembic.ini index c36965bb8..7d0469d9d 100644 --- a/alembic.ini +++ b/alembic.ini @@ -2,7 +2,7 @@ [alembic] # path to migration scripts -script_location = augur/application/schema/alembic +script_location = collectoss/application/schema/alembic # template used to generate migration files file_template = %%(rev)s_%%(slug)s diff --git a/augur/api/README.md b/augur/api/README.md deleted file mode 100644 index 4be10a55a..000000000 --- a/augur/api/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# API - -## General API Information - -1. It is served via Flask and Gunicorn - - -## API Developer Details - -### Starting API - -The api is started using a subprocess.Popen() on this command gunicorn -c `` -b `` --preload augur.api.server:app. This loads the gunicorn configuration from the location specified in the variable ``, and binds the gunicorn process to the ip and port specified in the `` variable. The gunicorn_location is a path to the `gunicorn_config.py` file in the `augur/api` directory. Then is tries to find the app variable in augur.api.server so it can use this as the Flask app. This cases these three lines of code to execute at the bottom of server.py: -```python -server = Server() -server.create_app() -app = server.get_app() -``` -So then Gunicorn uses this app to load the server. Note: Those three lines above are executed first because we are using preloading. This means that the Flask app is created first and then gunicorn gets it and copies it to all the workers. - -### Config - -The config located in `augur/api/gunicorn_conf.py` loads a default configuration and then if the config table in the augur_operation schema contains gunicorn config values they override the defaults. - -### Routes - -The routes are located in the augur/api/routes directory. These are added to the Flask app in the create_app() method in the Server class. This is done by calling create_all_routes() and passing the Flask app to it. create_all_routes() then gets a list of all of route files in the routes folder, and imports them. Then it calls the create_routes() function which must be defines in all the route files, and passes the flask app so the routes can be added to the app. For more information on routes please see the README in the rotues directory. - -### Metrics - -The metrics are located in the augur/api/metrics directory. They are a special kind of route that are created using the `@register_metric` decorator instead of the `@app.route` decorator. These are added to the Flask app in the create_app() method in the Server class. This is done by calling create_metrics(). 
create_metrics() then gets list of the metrics files in the metrics direcory, and imports them. Then it calls add_metrics() and passes the metrics file. add_metrics() then gets all the functions that are metrics and adds them to the Flask app. This is a very simplified version of how the metrics are added to the Flask app. Please see the README in the metrics direcory to learn more. \ No newline at end of file diff --git a/augur/api/routes/manager.py b/augur/api/routes/manager.py deleted file mode 100755 index 739a35ace..000000000 --- a/augur/api/routes/manager.py +++ /dev/null @@ -1,384 +0,0 @@ -#SPDX-License-Identifier: MIT -""" -Creates routes for the manager -""" - - -# TODO: Need to come back and fix this later - -import logging -import time -import requests -import sqlalchemy as s -from sqlalchemy import exc -from flask import request, Response -# from augur.config import AugurConfig -import os -import traceback - -from augur.api.routes import AUGUR_API_VERSION -from ..server import app - -logger = logging.getLogger(__name__) - - - -# @app.route('/{}/add-repos'.format(AUGUR_API_VERSION), methods=['POST']) -# def add_repos(): -# """ returns list of successfully inserted repos and repos that caused an error -# adds repos belonging to any user or group to an existing augur repo group -# 'repos' are in the form org/repo, user/repo, or maybe even a full url -# """ -# if authenticate_request(server.augur_app, request): -# group = request.json['group'] -# repo_manager = Repo_insertion_manager(group, engine) -# group_id = repo_manager.get_org_id() -# errors = {} -# errors['invalid_inputs'] = [] -# errors['failed_records'] = [] -# success = [] -# repos = request.json['repos'] -# for repo in repos: -# url = Git_string(repo) -# url.clean_full_string() -# try: #need to test because we require org/repo or full git url -# url.is_repo() -# repo_name = url.get_repo_name() -# repo_parent = url.get_repo_organization() -# except ValueError: -# errors['invalid_inputs'].append(repo) -# else: -# try: -# repo_id = repo_manager.insert_repo(group_id, repo_parent, repo_name) -# except exc.SQLAlchemyError: -# errors['failed_records'].append(repo_name) -# else: -# success.append(get_inserted_repo(group_id, repo_id, repo_name, group, repo_manager.github_urlify(repo_parent, repo_name))) - -# status_code = 200 -# summary = {'repos_inserted': success, 'repos_not_inserted': errors} -# summary = json.dumps(summary) -# else: -# status_code = 401 -# summary = json.dumps({'error': "Augur API key is either missing or invalid"}) - -# return Response(response=summary, -# status=status_code, -# mimetype="application/json") - -# @app.route('/{}/create-repo-group'.format(AUGUR_API_VERSION), methods=['POST']) -# def create_repo_group(): -# if authenticate_request(server.augur_app, request): -# group = request.json['group'] -# repo_manager = Repo_insertion_manager(group, engine) -# summary = {} -# summary['errors'] = [] -# summary['repo_groups_created'] = [] - -# if group == '': -# summary['errors'].append("invalid group name") -# return Response(response=summary, status=200, mimetype="application/json") - -# try: -# group_id = repo_manager.get_org_id() -# except TypeError: -# try: -# group_id = repo_manager.insert_repo_group() -# except TypeError: -# summary['errors'].append("couldn't create group") -# else: -# summary['repo_groups_created'].append({"repo_group_id": group_id, "rg_name": group}) -# else: -# summary['errors'].append("group already exists") - -# summary = json.dumps(summary) -# status_code = 200 -# else: -# 
status_code = 401 -# summary = json.dumps({'error': "Augur API key is either missing or invalid"}) - -# return Response(response=summary, -# status=status_code, -# mimetype="application/json") - -# @app.route('/{}/import-org'.format(AUGUR_API_VERSION), methods=['POST']) -# def add_repo_group(): -# """ creates a new augur repo group and adds to it the given organization or user's repos -# takes an organization or user name -# """ -# if authenticate_request(server.augur_app, request): -# group = request.json['org'] -# repo_manager = Repo_insertion_manager(group, engine) -# summary = {} -# summary['group_errors'] = [] -# summary['failed_repo_records'] = [] -# summary['repo_records_created'] = [] -# group_exists = False -# try: -# #look for group in augur db -# group_id = repo_manager.get_org_id() -# except TypeError: -# #look for group on github -# if repo_manager.group_exists_gh(): -# try: -# group_id = repo_manager.insert_repo_group() -# except TypeError: -# summary['group_errors'].append("failed to create group") -# else: -# group_exists = True -# else: -# summary['group_errors'].append("could not locate group in database or on github") -# else: -# group_exists = True - -# if group_exists: -# summary['group_id'] = str(group_id) -# summary['rg_name'] = group -# try: -# repos_gh = repo_manager.fetch_repos() -# repos_in_augur = repo_manager.get_existing_repos(group_id) -# repos_db_set = set() -# for name in repos_in_augur: -# #repo_git is more reliable than repo name, so we'll just grab everything after the last slash -# name = (name['repo_git'].rsplit('/', 1)[1]) -# repos_db_set.add(name) -# repos_to_insert = set(repos_gh) - repos_db_set - -# for repo in repos_to_insert: -# try: -# repo_id = repo_manager.insert_repo(group_id, group, repo) -# except exc.SQLAlchemyError: -# summary['failed_repo_records'].append(repo) -# else: -# summary['repo_records_created'].append(get_inserted_repo(group_id, repo_id, repo, group, repo_manager.github_urlify(group, repo))) -# except requests.ConnectionError: -# summary['group_errors'] = "failed to find the group's child repos" -# logger.debug(f'Error is: {e}.') -# except Exception as e: -# logger.debug(f'Error is: {e}.') - -# status_code = 200 -# summary = json.dumps(summary) -# else: -# status_code = 401 -# summary = json.dumps({'error': "Augur API key is either missing or invalid"}) - -# return Response(response=summary, -# status=status_code, -# mimetype="application/json") - -# def get_inserted_repo(groupid, repoid, reponame, groupname, url): -# inserted_repo={} -# inserted_repo['repo_group_id'] = str(groupid) -# inserted_repo['repo_id'] = str(repoid) -# inserted_repo['repo_name'] = reponame -# inserted_repo['rg_name'] = groupname -# inserted_repo['url'] = url -# return inserted_repo - -# class Repo_insertion_manager(): -# ROOT_AUGUR_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - -# def __init__(self, organization_name, database_connection): -# #self.initialize_logging() -# self.org = organization_name -# self.db = database_connection -# ## added for keys -# self._root_augur_dir = Repo_insertion_manager.ROOT_AUGUR_DIR -# self.augur_config = AugurConfig(self._root_augur_dir) -# ########## - - -# def get_existing_repos(self, group_id): -# """returns repos belonging to repogroup in augur db""" -# select_repos_query = s.sql.text(""" -# SELECT repo_git from augur_data.repo -# WHERE repo_group_id = :repo_group_id -# """) -# select_repos_query = select_repos_query.bindparams(repo_group_id = group_id) -# result = 
self.db.execute(select_repos_query) -# return result.fetchall() - -# ## This doesn't permit importing of an individual's repo, as they don't show up under "orgs" -# # def group_exists_gh(self): -# # url = url = "https://api.github.com/orgs/{}".format(self.org) -# # res = requests.get(url).json() -# # try: -# # if res['message'] == "Not Found": -# # return False -# # except KeyError: -# # return True - -# ## Revised Version of Method -# def group_exists_gh(self): -# url = url = "https://api.github.com/orgs/{}".format(self.org) -# ## attempting to add key due to rate limiting -# gh_api_key = self.augur_config.get_value('Database', 'key') -# self.headers = {'Authorization': 'token %s' % gh_api_key} -# #r = requests.get(url=cntrb_url, headers=self.headers) -# ####### Original request code -# # res = requests.get(url).json() -# ######## -# res = requests.get(url=url, headers=self.headers).json() -# try: -# if res['message'] == "Not Found": -# url = url = "https://api.github.com/users/{}".format(self.org) -# res = requests.get(url=url, headers=self.headers).json() -# if res['message'] == "Not Found": -# return False -# except KeyError: -# return True - -# def insert_repo(self, orgid, given_org, reponame): -# """creates a new repo record""" -# insert_repo_query = s.sql.text(""" -# INSERT INTO augur_data.repo(repo_group_id, repo_git, repo_status, -# tool_source, tool_version, data_source, data_collection_date) -# VALUES (:repo_group_id, :repo_git, 'New', 'CLI', 1.0, 'Git', CURRENT_TIMESTAMP) -# RETURNING repo_id -# """) -# repogit = self.github_urlify(given_org, reponame) -# insert_repo_query = insert_repo_query.bindparams(repo_group_id = int(orgid), repo_git = repogit) -# result = self.db.execute(insert_repo_query).fetchone() -# return result['repo_id'] - -# def github_urlify(self, org, repo): -# return "https://github.com/" + org + "/" + repo - -# def get_org_id(self): -# select_group_query = s.sql.text(""" -# SELECT repo_group_id -# FROM augur_data.repo_groups -# WHERE rg_name = :group_name -# """) -# select_group_query = select_group_query.bindparams(group_name = self.org) -# result = self.db.execute(select_group_query) -# row = result.fetchone() -# return row['repo_group_id'] - -# def insert_repo_group(self): -# """creates a new repo_group record and returns its id""" -# insert_group_query = s.sql.text(""" -# INSERT INTO augur_data.repo_groups(rg_name, rg_description, rg_website, rg_recache, rg_last_modified, rg_type, -# tool_source, tool_version, data_source, data_collection_date) -# VALUES (:group_name, '', '', 1, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', 1.0, 'Git', CURRENT_TIMESTAMP) -# RETURNING repo_group_id -# """) -# insert_group_query = insert_group_query.bindparams(group_name = self.org) -# result = self.db.execute(insert_group_query) -# row = result.fetchone() -# return row['repo_group_id'] - -# def fetch_repos(self): -# """uses the github api to return repos belonging to the given organization""" -# gh_api_key = self.augur_config.get_value('Database', 'key') -# self.headers = {'Authorization': 'token %s' % gh_api_key} -# repos = [] -# page = 1 -# url = self.paginate(page) -# res = requests.get(url, headers=self.headers).json() -# while res: -# for repo in res: -# repos.append(repo['name']) -# page += 1 -# res = requests.get(self.paginate(page)).json() -# return repos - -# ## Modified pagination to account for github orgs that look like orgs but are actually users. 
-# def paginate(self, page): -# ### Modified here to incorporate the use of a GitHub API Key -# gh_api_key = self.augur_config.get_value('Database', 'key') -# self.headers = {'Authorization': 'token %s' % gh_api_key} -# url = "https://api.github.com/orgs/{}/repos?per_page=100&page={}" -# res = requests.get(url, headers=self.headers).json() -# if res['message'] == "Not Found": -# url = "https://api.github.com/users/{}/repos?per_page=100&page={}" -# res = requests.get(url=url, headers=self.headers).json() -# return url.format(self.org, str(page)) - - -# #r = requests.get(url=cntrb_url, headers=self.headers) -# ####### Original request code -# # res = requests.get(url).json() -# ######## -# res = requests.get(url=url, headers=self.headers).json() - - - -# # url = "https://api.github.com/orgs/{}/repos?per_page=100&page={}" -# # res = requests.get(url).json() -# # if res['message'] == "Not Found": -# # url = "https://api.github.com/users/{}/repos?per_page=100&page={}" -# # res = requests.get(url).json() -# # return url.format(self.org, str(page)) - -# class Git_string(): -# """ represents possible repo, org or username arguments """ -# def __init__(self, string_to_process): -# self.name = string_to_process - -# def clean_full_string(self): -# """remove trailing slash, protocol, and source if present""" -# org = self.name -# if org.endswith('/'): -# org = org[:-1] -# if org.startswith('https://'): -# org = org[8:] -# slash_index = org.find('/') -# org = org[slash_index+1:] -# if org.startswith('git://'): -# org = org[6:] -# slash_index = org.find('/') -# org = org[slash_index+1:] -# self.name = org - -# def is_repo(self): -# """test for org/repo or user/repo form""" -# slash_count = 0 -# for char in self.name: -# if char == '/': -# slash_count += 1 -# if slash_count == 1: -# return -# else: -# raise ValueError - -# def get_repo_organization(self): -# org = self.name -# return org[:org.find('/')] - -# def get_repo_name(self): -# repo = self.name -# return repo[repo.find('/')+1:] - -# def authenticate_request(augur_app, request): - -# # do I like doing it like this? not at all -# # do I have the time to implement a better solution right now? not at all -# user = augur_app.config.get_value('Database', 'user') -# password = augur_app.config.get_value('Database', 'password') -# host = augur_app.config.get_value('Database', 'host') -# port = augur_app.config.get_value('Database', 'port') -# dbname = augur_app.config.get_value('Database', 'name') - -# DB_STR = 'postgresql://{}:{}@{}:{}/{}'.format( -# user, password, host, port, dbname -# ) - -# operations_db = s.create_engine(DB_STR, poolclass=s.pool.NullPool) - -# update_api_key_sql = s.sql.text(""" -# SELECT value FROM augur_operations.augur_settings WHERE setting='augur_api_key'; -# """) - -# retrieved_api_key = operations_db.execute(update_api_key_sql).fetchone()[0] - -# try: -# given_api_key = request.json['augur_api_key'] -# except KeyError: -# return False - -# if given_api_key == retrieved_api_key and given_api_key != "invalid_key": -# return True -# else: -# return False diff --git a/augur/application/log_analysis/http/empty_index.html b/augur/application/log_analysis/http/empty_index.html deleted file mode 100644 index 47e385041..000000000 --- a/augur/application/log_analysis/http/empty_index.html +++ /dev/null @@ -1,91 +0,0 @@ - - - - - - -

[deleted HTML not shown: a log-analysis viewer page with a "Select a worker:" dropdown] diff --git a/augur/application/log_analysis/http/index.html b/augur/application/log_analysis/http/index.html deleted file mode 100644 index 47e385041..000000000 --- a/augur/application/log_analysis/http/index.html +++ /dev/null @@ -1,91 +0,0 @@

[deleted HTML not shown: an identical log-analysis viewer page with a "Select a worker:" dropdown]
- - - - - - - - diff --git a/augur/static/img/auggie_shrug.png b/augur/static/img/auggie_shrug.png deleted file mode 100644 index f53cac189..000000000 Binary files a/augur/static/img/auggie_shrug.png and /dev/null differ diff --git a/augur/static/img/augur_logo.png b/augur/static/img/augur_logo.png deleted file mode 100644 index 7ef6a4ca8..000000000 Binary files a/augur/static/img/augur_logo.png and /dev/null differ diff --git a/augur/static/img/augur_logo_black.png b/augur/static/img/augur_logo_black.png deleted file mode 100644 index 5256f232e..000000000 Binary files a/augur/static/img/augur_logo_black.png and /dev/null differ diff --git a/augur/tasks/data_analysis/insight_worker/__init__.py b/augur/tasks/data_analysis/insight_worker/__init__.py deleted file mode 100644 index 1b6ff08fb..000000000 --- a/augur/tasks/data_analysis/insight_worker/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -#SPDX-License-Identifier: MIT -"""augur_worker_github - Augur Worker that collects GitHub data""" - -__version__ = '0.0.2' -__author__ = 'Augur Team ' -__all__ = [] diff --git a/augur/tasks/data_analysis/message_insights/__init__.py b/augur/tasks/data_analysis/message_insights/__init__.py deleted file mode 100644 index 165ea33e6..000000000 --- a/augur/tasks/data_analysis/message_insights/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -#SPDX-License-Identifier: MIT - -"""message_insights - Augur Worker that analyzes PR and issue messages""" - -__version__ = '0.3.1' -__author__ = 'Augur Team ' -__all__ = [] diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/__init__.py b/augur/tasks/data_analysis/pull_request_analysis_worker/__init__.py deleted file mode 100644 index c67856244..000000000 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""message_insights_worker - Augur Worker that predicts acceptance of a PR""" - -__version__ = '0.0.0' -__author__ = 'Augur Team ' -__all__ = [] \ No newline at end of file diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py deleted file mode 100644 index fbf121b2a..000000000 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ /dev/null @@ -1,12 +0,0 @@ -import logging -from augur.tasks.git.dependency_libyear_tasks.core import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask - -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_libyear_dependency_metrics(self, repo_git): - #raise NotImplementedError - - logger = logging.getLogger(process_libyear_dependency_metrics.__name__) - - deps_libyear_model(logger, repo_git) \ No newline at end of file diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py deleted file mode 100644 index dc0cd9472..000000000 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging -from augur.application.db.lib import get_session -from augur.tasks.git.scc_value_tasks.core import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask - - -@celery.task(base=AugurFacadeRepoCollectionTask) -def process_scc_value_metrics(repo_git): - - logger = logging.getLogger(process_scc_value_metrics.__name__) - - value_model(logger,repo_git,) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/__init__.py 
b/augur/tasks/git/util/facade_worker/facade_worker/__init__.py deleted file mode 100644 index 1753083d3..000000000 --- a/augur/tasks/git/util/facade_worker/facade_worker/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -#SPDX-License-Identifier: MIT -"""augur_worker_github - Augur Worker that collects GitHub data""" - -__version__ = '1.3.0' -__author__ = 'Augur Team ' -__all__ = [] diff --git a/augur/tasks/github/__init__.py b/augur/tasks/github/__init__.py deleted file mode 100644 index 63d68da41..000000000 --- a/augur/tasks/github/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from augur.tasks.github.contributors import * -from augur.tasks.github.events import * -from augur.tasks.github.issues import * -from augur.tasks.github.messages import * -from augur.tasks.github.pull_requests.tasks import * -from augur.tasks.github.repo_info.tasks import * -from augur.tasks.github.releases.tasks import * diff --git a/augur/tasks/github/augur-notes.code-workspace b/augur/tasks/github/augur-notes.code-workspace deleted file mode 100644 index f99c46d48..000000000 --- a/augur/tasks/github/augur-notes.code-workspace +++ /dev/null @@ -1,14 +0,0 @@ -{ - "folders": [ - { - "path": "../../../../../augurlabs/augur-notes" - }, - { - "path": "../../.." - }, - { - "path": "../../../../../sociallycompute/project2025" - } - ], - "settings": {} -} \ No newline at end of file diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py deleted file mode 100644 index 55b2281ad..000000000 --- a/augur/tasks/github/facade_github/core.py +++ /dev/null @@ -1,134 +0,0 @@ -from augur.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.github.util.github_task_session import * -from augur.application.db.models import * -from augur.tasks.util.AugurUUID import GithubUUID -from augur.application.db.lib import bulk_insert_dicts, batch_insert_contributors -from augur.tasks.github.util.github_data_access import GithubDataAccess - - - - -def query_github_contributors(logger, key_auth, github_url): - - """ Data collection function - Query the GitHub API for contributors - """ - - # Set platform id to 1 since it is a github method - platform_id = 1 - - # Extract owner/repo from the url for the endpoint - try: - owner, name = get_owner_repo(github_url) - except IndexError as e: - logger.error(f"Encountered bad url: {github_url}") - raise e - - # Set the base of the url and place to hold contributors to insert - contributors_url = ( - f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?state=all" - ) - - # Get contributors that we already have stored - # Set our duplicate and update column map keys (something other than PK) to - # check dupicates/needed column updates with - table = 'contributors' - table_pkey = 'cntrb_id' - update_col_map = {'cntrb_email': 'email'} - duplicate_col_map = {'cntrb_login': 'login'} - - github_data_access = GithubDataAccess(key_auth, logger) - - contributor_count = github_data_access.get_resource_count(contributors_url) - - logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n") - - if contributor_count == 0: - return - - for repo_contributor in github_data_access.paginate_resource(contributors_url): - try: - # Need to hit this single contributor endpoint to get extra data including... 
- # `created at` - # i think that's it - cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) - - - logger.info("Hitting endpoint: " + cntrb_url + " ...\n") - #r = hit_api(session.oauths, cntrb_url, logger) - #contributor = r.json() - - contributor = github_data_access.get_resource(cntrb_url) - - #logger.info(f"Contributor: {contributor} \n") - company = None - location = None - email = None - if 'company' in contributor: - company = contributor['company'] - if 'location' in contributor: - location = contributor['location'] - if 'email' in contributor: - email = contributor['email'] - canonical_email = contributor['email'] - - #TODO get and store an owner id - - #Generate ID for cntrb table - #cntrb_id = AugurUUID(session.platform_id,contributor['id']).to_UUID() - cntrb_id = GithubUUID() - cntrb_id["user"] = int(contributor['id']) - cntrb_id["platform"] = platform_id - - cntrb = { - "cntrb_id" : cntrb_id.to_UUID(), - "cntrb_login": contributor['login'], - "cntrb_created_at": contributor['created_at'], - "cntrb_email": email, - "cntrb_company": company, - "cntrb_location": location, - # "cntrb_type": , dont have a use for this as of now ... let it default to null - "cntrb_canonical": canonical_email, - "gh_user_id": contributor['id'], - "gh_login": contributor['login'], - "gh_url": contributor['url'], - "gh_html_url": contributor['html_url'], - "gh_node_id": contributor['node_id'], #This is what we are dup checking - "gh_avatar_url": contributor['avatar_url'], - "gh_gravatar_id": contributor['gravatar_id'], - "gh_followers_url": contributor['followers_url'], - "gh_following_url": contributor['following_url'], - "gh_gists_url": contributor['gists_url'], - "gh_starred_url": contributor['starred_url'], - "gh_subscriptions_url": contributor['subscriptions_url'], - "gh_organizations_url": contributor['organizations_url'], - "gh_repos_url": contributor['repos_url'], - "gh_events_url": contributor['events_url'], - "gh_received_events_url": contributor['received_events_url'], - "gh_type": contributor['type'], - "gh_site_admin": contributor['site_admin'], - "cntrb_last_used" : None if 'updated_at' not in contributor else contributor['updated_at'], - "cntrb_full_name" : None if 'name' not in contributor else contributor['name'], - #"tool_source": session.tool_source, - #"tool_version": session.tool_version, - #"data_source": session.data_source - } - - #insert cntrb to table. - #session.logger.info(f"Contributor: {cntrb} \n") - batch_insert_contributors(logger, [cntrb]) - - except Exception as e: - logger.error("Caught exception: {}".format(e)) - logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) - raise e - -# Get all the committer data for a repo. 
-# Used by facade in facade03analyzecommit -def grab_committer_list(logger, key_auth, repo_git, platform="github"): - - # Create API endpoint from repo_id - query_github_contributors(logger, key_auth, repo_git) - \ No newline at end of file diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py deleted file mode 100644 index 53a3d6648..000000000 --- a/augur/tasks/github/facade_github/tasks.py +++ /dev/null @@ -1,339 +0,0 @@ -import logging - - -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.tasks.github.facade_github.core import * -from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name, get_repo_by_repo_git, batch_insert_contributors -from augur.application.db.lib import get_session, execute_session_query -from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * - - -def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id): - - github_data_access = GithubDataAccess(auth, logger) - - for contributor in contributorQueue: - # Get the email from the commit data - email = contributor['email_raw'] if 'email_raw' in contributor else contributor['email'] - - name = contributor['name'] - - # check the email to see if it already exists in contributor_aliases - - # Look up email to see if resolved - alias_table_data = get_contributor_aliases_by_email(email) - if len(alias_table_data) >= 1: - # Move on if email resolved - logger.debug( - f"Email {email} has been resolved earlier.") - - continue - - #Check the unresolved_commits table to avoid hitting endpoints that we know don't have relevant data needlessly - - - unresolved_query_result = get_unresolved_commit_emails_by_name(name) - - if len(unresolved_query_result) >= 1: - - logger.debug(f"Commit data with email {email} has been unresolved in the past, skipping...") - continue - - login = None - - #Check the contributors table for a login for the given name - - contributors_with_matching_name = get_contributors_by_full_name(name) - - if not contributors_with_matching_name or len(contributors_with_matching_name) > 1: - logger.debug("Failed local login lookup") - else: - login = contributors_with_matching_name[0].gh_login - - - # Try to get the login from the commit sha - if login == None or login == "": - login = get_login_with_commit_hash(logger, auth, contributor, repo_id) - - if login == None or login == "": - logger.warning("Failed to get login from commit hash") - # Try to get the login from supplemental data if not found with the commit hash - login = get_login_with_supplemental_data(logger, auth,contributor) - - if login == None or login == "": - logger.error("Failed to get login from supplemental data!") - continue - - url = ("https://api.github.com/users/" + login) - - try: - user_data = github_data_access.get_resource(url) - except UrlNotFoundException as e: - logger.warning(f"User of {login} not found on github. 
Skipping...") - continue - - # Use the email found in the commit data if api data is NULL - emailFromCommitData = contributor['email_raw'] if 'email_raw' in contributor else contributor['email'] - - - # Get name from commit if not found by GitHub - name_field = contributor['commit_name'] if 'commit_name' in contributor else contributor['name'] - - - cntrb_id = GithubUUID() - cntrb_id["user"] = int(user_data['id']) - cntrb_id["platform"] = platform_id - - # try to add contributor to database - cntrb = { - "cntrb_id" : cntrb_id.to_UUID(), - "cntrb_login": user_data['login'], - "cntrb_created_at": user_data['created_at'], - "cntrb_email": user_data['email'] if 'email' in user_data else None, - "cntrb_company": user_data['company'] if 'company' in user_data else None, - "cntrb_location": user_data['location'] if 'location' in user_data else None, - # "cntrb_type": , dont have a use for this as of now ... let it default to null - "cntrb_canonical": user_data['email'] if 'email' in user_data and user_data['email'] is not None else emailFromCommitData, - "gh_user_id": user_data['id'], - "gh_login": user_data['login'], - "gh_url": user_data['url'], - "gh_html_url": user_data['html_url'], - "gh_node_id": user_data['node_id'], - "gh_avatar_url": user_data['avatar_url'], - "gh_gravatar_id": user_data['gravatar_id'], - "gh_followers_url": user_data['followers_url'], - "gh_following_url": user_data['following_url'], - "gh_gists_url": user_data['gists_url'], - "gh_starred_url": user_data['starred_url'], - "gh_subscriptions_url": user_data['subscriptions_url'], - "gh_organizations_url": user_data['organizations_url'], - "gh_repos_url": user_data['repos_url'], - "gh_events_url": user_data['events_url'], - "gh_received_events_url": user_data['received_events_url'], - "gh_type": user_data['type'], - "gh_site_admin": user_data['site_admin'], - "cntrb_last_used": None if 'updated_at' not in user_data else user_data['updated_at'], - # Get name from commit if api doesn't get it. - "cntrb_full_name": name_field if 'name' not in user_data or user_data['name'] is None else user_data['name'], - #"tool_source": interface.tool_source, - #"tool_version": interface.tool_version, - #"data_source": interface.data_source - } - - - - #Executes an upsert with sqlalchemy - cntrb_natural_keys = ['cntrb_id'] - batch_insert_contributors(logger, [cntrb]) - - try: - # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey - insert_alias(logger, cntrb, emailFromCommitData) - except LookupError as e: - logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) - logger.error( - f"Contributor id not able to be found in database despite the user_id existing. Something very wrong is happening. Error: {e}") - return - - - #Replace each instance of a single or double quote with escape characters - #for postgres - escapedEmail = email.replace('"',r'\"') - escapedEmail = escapedEmail.replace("'",r'\'') - # Resolve any unresolved emails if we get to this point. - # They will get added to the alias table later - # Do this last to absolutely make sure that the email was resolved before we remove it from the unresolved table. 
- query = s.sql.text(""" - DELETE FROM unresolved_commit_emails - WHERE email='{}' - """.format(escapedEmail)) - - logger.debug(f"Updating now resolved email {email}") - - try: - execute_sql(query) - except Exception as e: - logger.error( - f"Deleting now resolved email failed with error: {e}") - raise e - - - return - - -def link_commits_to_contributor(logger, facade_helper, contributorQueue): - - # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. - for cntrb in contributorQueue: - logger.debug( - f"These are the emails and cntrb_id's returned: {cntrb}") - - query = s.sql.text(""" - UPDATE commits - SET cmt_ght_author_id=:cntrb_id - WHERE - (cmt_author_raw_email=:cntrb_email - OR cmt_author_email=:cntrb_email) - AND cmt_ght_author_id is NULL - """).bindparams(cntrb_id=cntrb["cntrb_id"],cntrb_email=cntrb["email"]) - - #engine.execute(query, **data) - facade_helper.insert_or_update_data(query) - - - return - - -# Update the contributors table from the data facade has gathered. -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def insert_facade_contributors(self, repo_git): - - # Set platform id to 1 since this task is github specific - platform_id = 1 - - logger = logging.getLogger(insert_facade_contributors.__name__) - repo = get_repo_by_repo_git(repo_git) - repo_id = repo.repo_id - facade_helper = FacadeHelper(logger) - - with get_session() as session: - query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) - collection_status = execute_session_query(query,'one') - last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None - - # Get all of the commit data's emails and names from the commit table that do not appear - # in the contributors table or the contributors_aliases table. 
- - logger.info( - "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) - new_contrib_sql = s.sql.text(""" - SELECT DISTINCT - commits.cmt_author_name AS NAME, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'not_unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) - or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - UNION - SELECT DISTINCT - commits.cmt_author_name AS NAME,--commits.cmt_id AS id, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - ORDER BY - hash - """).bindparams(repo_id=repo_id,since_date=last_collected_date) - - #Execute statement with session. - result = execute_sql(new_contrib_sql) - - # Fetch all results immediately to close the database cursor/connection - # This prevents holding the connection open during GitHub API calls - rows = result.mappings().fetchall() - - #print(new_contribs) - - #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) - - - key_auth = GithubRandomKeyAuth(logger) - - # Process results in batches to reduce memory usage - batch = [] - BATCH_SIZE = 1000 - - for row in rows: - batch.append(dict(row)) - - if len(batch) >= BATCH_SIZE: - process_commit_metadata(logger, key_auth, batch, repo_id, platform_id) - batch.clear() - - # Process remaining items in batch - if batch: - process_commit_metadata(logger, key_auth, batch, repo_id, platform_id) - - logger.debug("DEBUG: Got through the new_contribs") - - # sql query used to find corresponding cntrb_id's of emails found in the contributor's table - # i.e., if a contributor already exists, we use it! 
- resolve_email_to_cntrb_id_sql = s.sql.text(""" - SELECT DISTINCT - cntrb_id, - contributors.cntrb_login AS login, - contributors.cntrb_canonical AS email, - commits.cmt_author_raw_email - FROM - contributors, - commits - WHERE - contributors.cntrb_canonical = commits.cmt_author_raw_email - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND commits.repo_id = :repo_id - UNION - SELECT DISTINCT - contributors_aliases.cntrb_id, - contributors.cntrb_login as login, - contributors_aliases.alias_email AS email, - commits.cmt_author_raw_email - FROM - contributors, - contributors_aliases, - commits - WHERE - contributors_aliases.alias_email = commits.cmt_author_raw_email - AND contributors.cntrb_id = contributors_aliases.cntrb_id - AND commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - """).bindparams(repo_id=repo_id,since_date=last_collected_date) - - - result = execute_sql(resolve_email_to_cntrb_id_sql) - - # Fetch all results immediately to close the database cursor/connection - # This prevents holding the connection open during database UPDATE operations - rows = result.mappings().fetchall() - - # Process results in batches to reduce memory usage - batch = [] - BATCH_SIZE = 1000 - - for row in rows: - batch.append(dict(row)) - - if len(batch) >= BATCH_SIZE: - link_commits_to_contributor(logger, facade_helper, batch) - batch.clear() - - # Process remaining items in batch - if batch: - link_commits_to_contributor(logger, facade_helper, batch) - - return - diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py deleted file mode 100644 index e6acdfa90..000000000 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ /dev/null @@ -1,19 +0,0 @@ -import logging -from augur.tasks.github.pull_requests.commits_model.core import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.lib import get_repo_by_repo_git - - - -@celery.task(base=AugurSecondaryRepoCollectionTask) -def process_pull_request_commits(repo_git: str, full_collection: bool) -> None: - - logger = logging.getLogger(process_pull_request_commits.__name__) - - repo = get_repo_by_repo_git(repo_git) - - with GithubTaskManifest(logger) as manifest: - - pull_request_commits_model(repo.repo_id, logger, manifest.augur_db, manifest.key_auth, full_collection) diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py deleted file mode 100644 index be75c88a9..000000000 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ /dev/null @@ -1,18 +0,0 @@ -import logging -from augur.tasks.github.pull_requests.files_model.core import * -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query - -@celery.task(base=AugurSecondaryRepoCollectionTask) -def process_pull_request_files(repo_git: str, full_collection: bool) -> None: - - logger = logging.getLogger(process_pull_request_files.__name__) - - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == 
repo_git) - repo = execute_session_query(query, 'one') - - pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth, full_collection) \ No newline at end of file diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py deleted file mode 100644 index 3e2210a7c..000000000 --- a/augur/tasks/github/releases/tasks.py +++ /dev/null @@ -1,22 +0,0 @@ -import logging - -from augur.tasks.github.releases.core import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git, get_session -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth - - -@celery.task(base=AugurCoreRepoCollectionTask) -def collect_releases(repo_git): - - logger = logging.getLogger(collect_releases.__name__) - - repo_obj = get_repo_by_repo_git(repo_git) - repo_id = repo_obj.repo_id - - key_auth = GithubRandomKeyAuth(logger) - - with get_session() as session: - - releases_model(session, key_auth, logger, repo_git, repo_id) \ No newline at end of file diff --git a/augur/__init__.py b/collectoss/__init__.py similarity index 100% rename from augur/__init__.py rename to collectoss/__init__.py diff --git a/collectoss/api/README.md b/collectoss/api/README.md new file mode 100644 index 000000000..af8c4561e --- /dev/null +++ b/collectoss/api/README.md @@ -0,0 +1,30 @@ +# API + +## General API Information + +1. It is served via Flask and Gunicorn + + +## API Developer Details + +### Starting API + +The API is started using a subprocess.Popen() call on the command `gunicorn -c <gunicorn_location> -b <ip>:<port> --preload collectoss.api.server:app`. This loads the gunicorn configuration from the path given by `gunicorn_location`, and binds the gunicorn process to the specified ip and port. The gunicorn_location is a path to the `gunicorn_config.py` file in the `collectoss/api` directory. Gunicorn then tries to find the app variable in collectoss.api.server so it can use it as the Flask app. This causes these three lines of code to execute at the bottom of server.py: +```python +server = Server() +server.create_app() +app = server.get_app() +``` +Gunicorn then uses this app to load the server. Note: the three lines above are executed first because we are using preloading. This means that the Flask app is created first, and gunicorn then copies it to all the workers. + +### Config + +The config located in `collectoss/api/gunicorn_conf.py` loads a default configuration; if the config table in the augur_operation schema contains gunicorn config values, those override the defaults. + +### Routes + +The routes are located in the collectoss/api/routes directory. These are added to the Flask app in the create_app() method in the Server class. This is done by calling create_all_routes() and passing the Flask app to it. create_all_routes() gets a list of all of the route files in the routes folder and imports them. It then calls the create_routes() function, which must be defined in every route file, and passes the Flask app so the routes can be added to it. For more information on routes, please see the README in the routes directory. + +### Metrics + +The metrics are located in the collectoss/api/metrics directory. They are a special kind of route created using the `@register_metric` decorator instead of the `@app.route` decorator. These are added to the Flask app in the create_app() method in the Server class. This is done by calling create_metrics(), which gets a list of the metric files in the metrics directory and imports them. It then calls add_metrics() and passes each metrics file. add_metrics() then gets all the functions that are metrics and adds them to the Flask app. This is a very simplified version of how the metrics are added to the Flask app. Please see the README in the metrics directory to learn more. \ No newline at end of file
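To make the launch sequence in the README above concrete, here is a minimal, illustrative sketch of such a Popen call. It is not the project's actual launcher code; the config path, ip, and port below are assumptions chosen for the example.

```python
import subprocess

# Assumed example values: the real launcher derives these from configuration.
gunicorn_location = "collectoss/api/gunicorn_conf.py"  # hypothetical path
bind_address = "0.0.0.0:5000"                          # hypothetical ip:port

# --preload imports collectoss.api.server once in the gunicorn master
# process, running the module-level Server()/create_app() lines shown
# above; workers are then forked with the finished Flask app.
process = subprocess.Popen([
    "gunicorn",
    "-c", gunicorn_location,
    "-b", bind_address,
    "--preload",
    "collectoss.api.server:app",
])

# The parent process can later stop the API with process.terminate().
```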
diff --git a/augur/api/__init__.py b/collectoss/api/__init__.py similarity index 100% rename from augur/api/__init__.py rename to collectoss/api/__init__.py diff --git a/augur/api/gunicorn_conf.py b/collectoss/api/gunicorn_conf.py similarity index 74% rename from augur/api/gunicorn_conf.py rename to collectoss/api/gunicorn_conf.py index 6586b6f69..22c11231a 100644 --- a/augur/api/gunicorn_conf.py +++ b/collectoss/api/gunicorn_conf.py @@ -1,19 +1,19 @@ -# from augur import ROOT_AUGUR_DIRECTORY +# from collectoss import ROOT_PROJECT_REPO_DIRECTORY import multiprocessing import logging import os from pathlib import Path from glob import glob -from augur.application.db.lib import get_value -from augur.application.db import dispose_database_engine +from collectoss.application.db.lib import get_value +from collectoss.application.db import dispose_database_engine logger = logging.getLogger(__name__) -# ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +# ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -# base_log_dir = ROOT_AUGUR_DIRECTORY + "/logs/" +# base_log_dir = ROOT_PROJECT_REPO_DIRECTORY + "/logs/" # Path(base_log_dir).mkdir(exist_ok=True) @@ -25,16 +25,16 @@ if is_dev: - augur_templates_dir = Path.cwd() / "augur/templates" + project_templates_dir = Path.cwd() / "collectoss/templates" - if not augur_templates_dir.is_dir(): + if not project_templates_dir.is_dir(): logger.critical("Could not locate templates in Gunicorn startup") exit(-1) - reload_extra_files = glob(str(augur_templates_dir.resolve() / '**/*.j2'), recursive=True) + reload_extra_files = glob(str(project_templates_dir.resolve() / '**/*.j2'), recursive=True) # Don't want to leave extraneous variables in config scope - del augur_templates_dir + del project_templates_dir del is_dev # set the log location for gunicorn diff --git a/augur/api/metrics/README.md b/collectoss/api/metrics/README.md similarity index 96% rename from augur/api/metrics/README.md rename to collectoss/api/metrics/README.md index 97d90ebbc..d96f0018b 100644 --- a/augur/api/metrics/README.md +++ b/collectoss/api/metrics/README.md @@ -13,7 +13,7 @@ Metrics are standardized endpoints that take a repo_id or a repo_group_id, as pa import datetime import sqlalchemy as s import pandas as pd -from augur.api.util import register_metric +from collectoss.api.util import register_metric ``` 3. Defining the function 1. Add the decorator @register_metric to the function @@ -55,9 +55,9 @@ app = server.get_app() ``` 3. Then `create_app()` calls the function `create_metrics()` 4. Then `create_metrics()` - 1. Gets a list of metric files from the `augur/api/metrics` directory + 1. Gets a list of metric files from the `collectoss/api/metrics` directory 2. Loops through the list of metric files and imports them all - 3. As each file is getting imported all functions with the decorator `@register_metric()` are getting called and the decorator function in `augur/api/util.py` is called + 3.
As each file is imported, all functions with the decorator `@register_metric()` are called, which invokes the decorator function in `collectoss/api/util.py` 4. The `@register_metric()` decorator adds the attribute `is_metric` to the function so we can determine later that it is a metric 5. Loops through the list of metric files and calls the `add_metrics(file)` method, passing the metrics file 5. The `add_metrics(file)` method then loops through the imported module and finds the functions that are metrics using the `is_metric` attribute
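To make the registration flow above concrete, here is a minimal sketch of how a `@register_metric()`-style decorator can tag functions for later discovery. It illustrates the pattern only and is not the project's actual `collectoss/api/util.py` implementation:

```python
# Hypothetical sketch of the metric-registration pattern described above.
def register_metric(type="standard"):
    def decorate(function):
        # Tag the function at import time so add_metrics() can later
        # discover it by checking for the is_metric attribute.
        function.is_metric = True
        function.metadata = {"type": type}
        return function
    return decorate

@register_metric()
def example_metric(repo_group_id, repo_id=None):
    # A metric is an ordinary function; the decorator only marks it.
    return []

assert example_metric.is_metric  # discoverable by the add_metrics() scan
```

Because the tag is applied at import time, simply importing each metrics file is enough for `add_metrics()` to find every metric in it.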
diff --git a/augur/api/metrics/__init__.py b/collectoss/api/metrics/__init__.py similarity index 100% rename from augur/api/metrics/__init__.py rename to collectoss/api/metrics/__init__.py diff --git a/augur/api/metrics/commit.py b/collectoss/api/metrics/commit.py similarity index 99% rename from augur/api/metrics/commit.py rename to collectoss/api/metrics/commit.py index a8e12dca7..de2c84809 100644 --- a/augur/api/metrics/commit.py +++ b/collectoss/api/metrics/commit.py @@ -8,7 +8,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() def committers(repo_group_id, repo_id=None, begin_date=None, end_date=None, period='month'): diff --git a/augur/api/metrics/contributor.py b/collectoss/api/metrics/contributor.py similarity index 99% rename from augur/api/metrics/contributor.py rename to collectoss/api/metrics/contributor.py index b89e36e76..4dbc35e8f 100644 --- a/augur/api/metrics/contributor.py +++ b/collectoss/api/metrics/contributor.py @@ -8,7 +8,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() diff --git a/augur/api/metrics/deps.py b/collectoss/api/metrics/deps.py similarity index 99% rename from augur/api/metrics/deps.py rename to collectoss/api/metrics/deps.py index 486fd1ec4..ef13aee7d 100644 --- a/augur/api/metrics/deps.py +++ b/collectoss/api/metrics/deps.py @@ -8,7 +8,7 @@ import datetime from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() diff --git a/augur/api/metrics/experimental.py b/collectoss/api/metrics/experimental.py similarity index 100% rename from augur/api/metrics/experimental.py rename to collectoss/api/metrics/experimental.py diff --git a/augur/api/metrics/insight.py b/collectoss/api/metrics/insight.py similarity index 96% rename from augur/api/metrics/insight.py rename to collectoss/api/metrics/insight.py index e5dad6182..3e5caf265 100644 --- a/augur/api/metrics/insight.py +++ b/collectoss/api/metrics/insight.py @@ -5,7 +5,7 @@ import sqlalchemy as s import pandas as pd -from augur.api.util import register_metric +from collectoss.api.util import register_metric from flask import current_app @register_metric() diff --git a/augur/api/metrics/issue.py b/collectoss/api/metrics/issue.py similarity index 99% rename from augur/api/metrics/issue.py rename to collectoss/api/metrics/issue.py index 3410fbf80..f50bf30a3 100644 --- a/augur/api/metrics/issue.py +++ b/collectoss/api/metrics/issue.py @@ -8,7 +8,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() diff --git a/augur/api/metrics/message.py b/collectoss/api/metrics/message.py similarity index 98% rename from augur/api/metrics/message.py rename to collectoss/api/metrics/message.py index 78a8338d9..f76aabd28 100644 --- a/augur/api/metrics/message.py +++ b/collectoss/api/metrics/message.py @@ -9,7 +9,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() diff --git a/augur/api/metrics/platform.py b/collectoss/api/metrics/platform.py similarity index 100% rename from augur/api/metrics/platform.py rename to collectoss/api/metrics/platform.py diff --git a/augur/api/metrics/pull_request.py b/collectoss/api/metrics/pull_request.py similarity index 99% rename from augur/api/metrics/pull_request.py rename to collectoss/api/metrics/pull_request.py index 447c9557a..20d6be893 100644 --- a/augur/api/metrics/pull_request.py +++ b/collectoss/api/metrics/pull_request.py @@ -8,7 +8,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): diff --git a/augur/api/metrics/release.py b/collectoss/api/metrics/release.py similarity index 99% rename from augur/api/metrics/release.py rename to collectoss/api/metrics/release.py index 890bb481b..8846cb2fc 100644 --- a/augur/api/metrics/release.py +++ b/collectoss/api/metrics/release.py @@ -8,7 +8,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric() def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): diff --git a/augur/api/metrics/repo_meta.py b/collectoss/api/metrics/repo_meta.py similarity index 98% rename from augur/api/metrics/repo_meta.py rename to collectoss/api/metrics/repo_meta.py index c39922e17..7c4129081 100644 --- a/augur/api/metrics/repo_meta.py +++ b/collectoss/api/metrics/repo_meta.py @@ -9,10 +9,10 @@ import logging from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric -logger = logging.getLogger("augur") +logger = logging.getLogger("collectoss") @register_metric() def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, end_date=None): @@ -347,12 +347,15 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): @register_metric() def forks(repo_group_id, repo_id=None): - """ - Returns a time series of the fork count + """CHAOSS Technical Fork Metric (Time Series) + + Measures the number of technical forks of a repository on the same code development platform over time. + A technical fork is a platform-native fork (e.g., a GitHub fork), excluding local clones. + Canonical definition: https://chaoss.community/?p=3431 :param repo_group_id: The repository's repo_group_id :param repo_id: The repository's repo_id, defaults to None - :return: Time series of fork count + :return: Time series of technical fork counts """ if not repo_id: forks_SQL = s.sql.text(""" @@ -389,13 +392,17 @@ def forks(repo_group_id, repo_id=None): @register_metric() def fork_count(repo_group_id, repo_id=None): - """ - Returns the latest fork count + """CHAOSS Technical Fork Metric (Latest Value) + + Returns the most recent count of technical forks for a repository. + A technical fork is a platform-native fork on the same code development platform.
+ Canonical definition: https://chaoss.community/?p=3431 :param repo_group_id: The repository's repo_group_id :param repo_id: The repository's repo_id, defaults to None - :return: Fork count + :return: Latest technical fork count """ + if not repo_id: fork_count_SQL = s.sql.text(""" SELECT a.repo_id, repo_name, a.fork_count AS forks diff --git a/augur/api/metrics/toss.py b/collectoss/api/metrics/toss.py similarity index 98% rename from augur/api/metrics/toss.py rename to collectoss/api/metrics/toss.py index 40a4a12b0..698b4cf31 100644 --- a/augur/api/metrics/toss.py +++ b/collectoss/api/metrics/toss.py @@ -4,7 +4,7 @@ import pandas as pd from flask import current_app -from augur.api.util import register_metric +from collectoss.api.util import register_metric @register_metric(type="toss") diff --git a/augur/api/routes/README.md b/collectoss/api/routes/README.md similarity index 86% rename from augur/api/routes/README.md rename to collectoss/api/routes/README.md index c331c586f..fad191e5b 100644 --- a/augur/api/routes/README.md +++ b/collectoss/api/routes/README.md @@ -13,21 +13,21 @@ The routes directory contains basic routes that are defined using the `@app.rout ```python def create_routes(app): - @app.route('/{}/route_1'.format(AUGUR_API_VERSION), methods=["GET"]) + @app.route('/{}/route_1'.format(API_VERSION), methods=["GET"]) def first_route(): # code to get data return Response(response=data, status=200, mimetype="application/json") - @app.route('/{}/route_2'.format(AUGUR_API_VERSION), methods=["GET", "POST"]) + @app.route('/{}/route_2'.format(API_VERSION), methods=["GET", "POST"]) def route_2(): # code to get data return Response(response=data, status=200, mimetype="application/json") - @app.route('/{}/route_3'.format(AUGUR_API_VERSION), methods=["GET"]) + @app.route('/{}/route_3'.format(API_VERSION), methods=["GET"]) def route_3(): # code to get data @@ -52,7 +52,7 @@ app = server.get_app() ``` 3. Then `create_app()` calls the function `create_all_routes(app)` and passes the Flask app 4. Then `create_all_routes(app)` - 1. Gets a list of route files from the `augur/api/rotues` directory + 1. Gets a list of route files from the `collectoss/api/routes` directory 2. Loops through the list of route files and imports them all 3. Loops through the list of route files and calls each file's create_routes(app) function, passing the Flask app 5. The `create_routes(app)` function in each file then uses the `app` to define all the routes using `@app.route()`
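A rough sketch of the dynamic-import flow this README describes; the glob pattern and module path below are illustrative assumptions, not the project's actual `create_all_routes()`:

```python
# Hypothetical sketch of the route-discovery flow described above.
import glob
import importlib
from pathlib import Path

def create_all_routes(app):
    # Gather every route file in collectoss/api/routes, skipping dunder files.
    for path in glob.iglob("collectoss/api/routes/*.py"):
        name = Path(path).stem
        if name.startswith("__"):
            continue
        module = importlib.import_module(f"collectoss.api.routes.{name}")
        # Each route file defines create_routes(app), which registers its
        # endpoints on the Flask app with @app.route().
        module.create_routes(app)
```

Importing a route file is itself enough to register any routes it defines at module level; the explicit `create_routes(app)` call covers files that follow the factory pattern shown earlier in this README.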
diff --git a/augur/api/routes/__init__.py b/collectoss/api/routes/__init__.py similarity index 87% rename from augur/api/routes/__init__.py rename to collectoss/api/routes/__init__.py index 8176dad94..0fc056f58 100644 --- a/augur/api/routes/__init__.py +++ b/collectoss/api/routes/__init__.py @@ -1,4 +1,4 @@ -AUGUR_API_VERSION = 'api/unstable' +API_VERSION = 'api/unstable' from .application import * from .batch import * diff --git a/augur/api/routes/application.py b/collectoss/api/routes/application.py similarity index 87% rename from augur/api/routes/application.py rename to collectoss/api/routes/application.py index a978bb1e6..252819c8e 100644 --- a/augur/api/routes/application.py +++ b/collectoss/api/routes/application.py @@ -5,16 +5,16 @@ import logging from flask import request, jsonify -from augur.api.util import api_key_required, ssl_required +from collectoss.api.util import api_key_required, ssl_required -from augur.application.db.models import User, ClientApplication +from collectoss.application.db.models import User, ClientApplication from ..server import app logger = logging.getLogger(__name__) -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION -@app.route(f"/{AUGUR_API_VERSION}/application", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/application", methods=['GET', 'POST']) @ssl_required @api_key_required def get_application_info(application: ClientApplication): @@ -29,7 +29,7 @@ def get_application_info(application: ClientApplication): return info -@app.route(f"/{AUGUR_API_VERSION}/application/group/repos", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/application/group/repos", methods=['GET', 'POST']) @ssl_required @api_key_required def get_application_group_repos(application: ClientApplication): @@ -71,7 +71,7 @@ def get_application_group_repos(application: ClientApplication): return jsonify(result_dict) -@app.route(f"/{AUGUR_API_VERSION}/application/group/repos/count", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/application/group/repos/count", methods=['GET', 'POST']) @ssl_required @api_key_required def get_application_group_repo_count(application: ClientApplication): @@ -99,7 +99,7 @@ def get_application_group_repo_count(application: ClientApplication): return jsonify(result_dict) -@app.route(f"/{AUGUR_API_VERSION}/application/groups/names", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/application/groups/names", methods=['GET', 'POST']) @ssl_required @api_key_required def get_application_groups(application: ClientApplication): @@ -108,7 +108,7 @@ def get_application_groups(application: ClientApplication): return jsonify({"status": "success", "group_names": result[0]}) -@app.route(f"/{AUGUR_API_VERSION}/application/groups/repos/", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/application/groups/repos/", methods=['GET', 'POST']) @ssl_required @api_key_required def get_application_groups_and_repos(application: ClientApplication): diff --git a/augur/api/routes/auggie.py b/collectoss/api/routes/auggie.py similarity index 98% rename from augur/api/routes/auggie.py rename to collectoss/api/routes/auggie.py index 122e73ea3..18642498f 100644 --- a/augur/api/routes/auggie.py +++ b/collectoss/api/routes/auggie.py @@ -6,7 +6,7 @@ import base64 import sqlalchemy as s import pandas as pd -from augur.api.util import metric_metadata +from collectoss.api.util import metric_metadata import boto3
import json from boto3.dynamodb.conditions import Key, Attr @@ -17,7 +17,7 @@ from ..server import app -AUGUR_API_VERSION = 'api/unstable' +API_VERSION = 'api/unstable' # def annotate(metadata=None, **kwargs): @@ -76,7 +76,7 @@ # print(user_response) # email = user_response["user"]["email"] -# profile_name = 'augur' +# profile_name = 'collectoss' # if os.environ.get('AUGUR_IS_PROD'): # profile_name = 'default' # print("Making Boto3 Session") @@ -181,7 +181,7 @@ # #@annotate(tag='update-auggie-user-tracking') # def update_tracking(metric, body): -# profile_name = 'augur' +# profile_name = 'collectoss' # if os.environ.get('AUGUR_IS_PROD'): # profile_name = 'default' # client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') @@ -224,7 +224,7 @@ # #@annotate(tag='get-auggie-user') # def get_auggie_user(metric, body): -# profile_name = 'augur' +# profile_name = 'collectoss' # if os.environ.get('AUGUR_IS_PROD'): # profile_name = 'default' # client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') @@ -251,7 +251,7 @@ def get_auggie_user(): # response = server.transform(metrics.get_auggie_user, args=arg) # return Response(response=response, status=200, mimetype="application/json") ## From Method - profile_name = 'augur' + profile_name = 'collectoss' if os.environ.get('AUGUR_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') @@ -277,7 +277,7 @@ def update_auggie_user_tracking(): # response = server.transform(metrics.update_tracking, args=arg) # return Response(response=response, status=200, mimetype="application/json") ## From Method - profile_name = 'augur' + profile_name = 'collectoss' if os.environ.get('AUGUR_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') @@ -339,7 +339,7 @@ def slack_login(): print(user_response) email = user_response["user"]["email"] - profile_name = 'augur' + profile_name = 'collectoss' if os.environ.get('AUGUR_IS_PROD'): profile_name = 'default' print("Making Boto3 Session") diff --git a/augur/api/routes/batch.py b/collectoss/api/routes/batch.py similarity index 94% rename from augur/api/routes/batch.py rename to collectoss/api/routes/batch.py index fbbe1a854..a046af446 100644 --- a/augur/api/routes/batch.py +++ b/collectoss/api/routes/batch.py @@ -9,16 +9,16 @@ import sqlalchemy as s from sqlalchemy import exc from flask import request, Response -from augur.api.util import metric_metadata +from collectoss.api.util import metric_metadata import json from ..server import app -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION logger = logging.getLogger(__name__) -@app.route('/{}/batch'.format(AUGUR_API_VERSION), methods=['GET', 'POST']) +@app.route('/{}/batch'.format(API_VERSION), methods=['GET', 'POST']) def batch(): """ Execute multiple requests, submitted as a batch. 
@@ -101,7 +101,7 @@ def batch(): @apiDescription Returns metadata of batch requests POST JSON of API requests metadata """ -@app.route('/{}/batch/metadata'.format(AUGUR_API_VERSION), methods=['GET', 'POST']) +@app.route('/{}/batch/metadata'.format(API_VERSION), methods=['GET', 'POST']) def batch_metadata(): """ Returns endpoint metadata in batch format diff --git a/augur/api/routes/collection_status.py b/collectoss/api/routes/collection_status.py similarity index 96% rename from augur/api/routes/collection_status.py rename to collectoss/api/routes/collection_status.py index 0f6f3cfac..eaa374f4c 100644 --- a/augur/api/routes/collection_status.py +++ b/collectoss/api/routes/collection_status.py @@ -4,11 +4,11 @@ import json from flask import Response, current_app -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION from ..server import app -@app.route('/{}/collection_status/commits'.format(AUGUR_API_VERSION)) +@app.route('/{}/collection_status/commits'.format(API_VERSION)) def commit_collection_status(): # TODO: make this name automatic - wrapper? commit_collection_sql = s.sql.text(""" SELECT @@ -34,7 +34,7 @@ def commit_collection_status(): # TODO: make this name automatic - wrapper? status=200, mimetype="application/json") -@app.route('/{}/collection_status/issues'.format(AUGUR_API_VERSION)) +@app.route('/{}/collection_status/issues'.format(API_VERSION)) def issue_collection_status(): # TODO: make this name automatic - wrapper? issue_collection_sql = s.sql.text(""" SELECT @@ -98,7 +98,7 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? status=200, mimetype="application/json") -@app.route('/{}/collection_status/pull_requests'.format(AUGUR_API_VERSION)) +@app.route('/{}/collection_status/pull_requests'.format(API_VERSION)) def pull_request_collection_status(): # TODO: make this name automatic - wrapper? 
pull_request_collection_sql = s.sql.text(""" SELECT diff --git a/augur/api/routes/complexity.py b/collectoss/api/routes/complexity.py similarity index 95% rename from augur/api/routes/complexity.py rename to collectoss/api/routes/complexity.py index feb58c6c3..11fbf5ebe 100644 --- a/augur/api/routes/complexity.py +++ b/collectoss/api/routes/complexity.py @@ -2,15 +2,15 @@ from flask import Response, current_app, request import pandas as pd import sqlalchemy as s -from augur.api.util import metric_metadata +from collectoss.api.util import metric_metadata import os import requests -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION from ..server import app -@app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/complexity/project_languages'.format(API_VERSION), methods=["GET"]) def get_project_languages(): repo_id = request.args.get('repo_id') @@ -57,7 +57,7 @@ def get_project_languages(): status=200, mimetype="application/json") -@app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/complexity/project_files'.format(API_VERSION), methods=["GET"]) def get_project_files(): project_files_sql = s.sql.text(""" SELECT @@ -96,7 +96,7 @@ def get_project_files(): status=200, mimetype="application/json") -@app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/complexity/project_lines'.format(API_VERSION), methods=["GET"]) def get_project_lines(): repo_id = request.args.get('repo_id') @@ -140,7 +140,7 @@ def get_project_lines(): status=200, mimetype="application/json") -@app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/complexity/project_comment_lines'.format(API_VERSION), methods=["GET"]) def get_project_comment_lines(): repo_id = request.args.get('repo_id') @@ -185,7 +185,7 @@ def get_project_comment_lines(): status=200, mimetype="application/json") -@app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/complexity/project_blank_lines'.format(API_VERSION), methods=["GET"]) def get_project_blank_lines(): repo_id = request.args.get('repo_id') @@ -231,7 +231,7 @@ def get_project_blank_lines(): mimetype="application/json") -@app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/complexity/project_file_complexity'.format(API_VERSION), methods=["GET"]) def get_project_file_complexity(): project_file_complexity_sql = s.sql.text(""" SELECT diff --git a/augur/api/routes/config.py b/collectoss/api/routes/config.py similarity index 74% rename from augur/api/routes/config.py rename to collectoss/api/routes/config.py index 08618091a..8c66d8b22 100644 --- a/augur/api/routes/config.py +++ b/collectoss/api/routes/config.py @@ -7,17 +7,17 @@ import sqlalchemy as s # Disable the requirement for SSL by setting env["AUGUR_DEV"] = True -from augur.application.config import get_development_flag -from augur.application.db.lib import get_session -from augur.application.db.models import Config -from augur.application.config import AugurConfig -from augur.application.db.session import DatabaseSession +from collectoss.application.config import get_development_flag +from collectoss.application.db.lib import get_session +from collectoss.application.db.models import Config +from collectoss.application.config import SystemConfig +from collectoss.application.db.session 
import DatabaseSession from ..server import app logger = logging.getLogger(__name__) development = get_development_flag() -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION def generate_upgrade_request(): # https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/426 @@ -27,19 +27,19 @@ def generate_upgrade_request(): return response, 426 -@app.route(f"/{AUGUR_API_VERSION}/config/get", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/config/get", methods=['GET', 'POST']) def get_config(): if not development and not request.is_secure: return generate_upgrade_request() with DatabaseSession(logger, engine=current_app.engine) as session: - config_dict = AugurConfig(logger, session).config.load_config() + config_dict = SystemConfig(logger, session).config.load_config() return jsonify(config_dict), 200 -@app.route(f"/{AUGUR_API_VERSION}/config/update", methods=['POST']) +@app.route(f"/{API_VERSION}/config/update", methods=['POST']) def update_config(): if not development and not request.is_secure: return generate_upgrade_request() diff --git a/augur/api/routes/dei.py b/collectoss/api/routes/dei.py similarity index 83% rename from augur/api/routes/dei.py rename to collectoss/api/routes/dei.py index 44fe01461..64af957bf 100644 --- a/augur/api/routes/dei.py +++ b/collectoss/api/routes/dei.py @@ -7,23 +7,23 @@ from flask import request, jsonify, render_template, send_file, current_app from pathlib import Path -from augur.api.util import api_key_required, ssl_required +from collectoss.api.util import api_key_required, ssl_required -from augur.application.db.models import ClientApplication, CollectionStatus, Repo, RepoGroup, BadgingDEI -from augur.application.db.session import DatabaseSession +from collectoss.application.db.models import ClientApplication, CollectionStatus, Repo, RepoGroup, BadgingDEI +from collectoss.application.db.session import DatabaseSession -from augur.tasks.util.collection_util import CollectionRequest,AugurTaskRoutine, get_enabled_phase_names_from_config_session, core_task_success_util -from augur.tasks.start_tasks import prelim_phase, primary_repo_collect_phase -from augur.tasks.github.util.util import get_repo_weight_by_issue +from collectoss.tasks.util.collection_util import CollectionRequest,CollectionTaskRoutine, get_enabled_phase_names_from_config_session, core_task_success_util +from collectoss.tasks.start_tasks import prelim_phase, primary_repo_collect_phase +from collectoss.tasks.github.util.util import get_repo_weight_by_issue from ..server import app logger = logging.getLogger(__name__) -from augur.api.routes import AUGUR_API_VERSION -from augur.application.db.models.augur_operations import FRONTEND_REPO_GROUP_NAME +from collectoss.api.routes import API_VERSION +from collectoss.application.db.models.augur_operations import FRONTEND_REPO_GROUP_NAME -@app.route(f"/{AUGUR_API_VERSION}/dei/repo/add", methods=['POST']) +@app.route(f"/{API_VERSION}/dei/repo/add", methods=['POST']) @ssl_required @api_key_required def dei_track_repo(application: ClientApplication): @@ -95,13 +95,13 @@ def core_task_success_util_gen(repo_git): deiHook = CollectionRequest("core",primary_enabled_phases) deiHook.repo_list = [repo_url] - singleRoutine = AugurTaskRoutine(logger, session,[deiHook]) + singleRoutine = CollectionTaskRoutine(logger, session,[deiHook]) singleRoutine.start_data_collection() #start_block_of_repos(logger, session, [repo_url], primary_enabled_phases, "new") return jsonify({"status": "Success"}) 
-@app.route(f"/{AUGUR_API_VERSION}/dei/report", methods=['POST']) +@app.route(f"/{API_VERSION}/dei/report", methods=['POST']) @ssl_required @api_key_required def dei_report(application: ClientApplication): @@ -124,7 +124,7 @@ def dei_report(application: ClientApplication): project_id = project.id # Session is now closed - proceed with file operations (no database access needed) - cachePath = Path.cwd() / "augur" / "static" / "cache" + cachePath = Path.cwd() / "collectoss" / "static" / "cache" source = cachePath / f"{project_id}_badging_report.md" report = cachePath / f"{project_id}_badging_report.pdf" diff --git a/collectoss/api/routes/manager.py b/collectoss/api/routes/manager.py new file mode 100755 index 000000000..6e1ea6a81 --- /dev/null +++ b/collectoss/api/routes/manager.py @@ -0,0 +1,23 @@ +#SPDX-License-Identifier: MIT +""" +Creates routes for the manager +""" + + +# TODO: Need to come back and fix this later + +import logging +import time +import requests +import sqlalchemy as s +from sqlalchemy import exc +from flask import request, Response +# from collectoss.config import SystemConfig +import os +import traceback + +from collectoss.api.routes import API_VERSION +from ..server import app + +logger = logging.getLogger(__name__) + diff --git a/augur/api/routes/metadata.py b/collectoss/api/routes/metadata.py similarity index 89% rename from augur/api/routes/metadata.py rename to collectoss/api/routes/metadata.py index ced7b60ef..edd65f595 100644 --- a/augur/api/routes/metadata.py +++ b/collectoss/api/routes/metadata.py @@ -4,10 +4,10 @@ import pandas as pd import json -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION from ..server import app -@app.route('/{}/metadata/repo_info'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/metadata/repo_info'.format(API_VERSION), methods=["GET"]) def get_repo_info(): repo_info_sql = s.sql.text(""" SELECT @@ -48,7 +48,7 @@ def get_repo_info(): status=200, mimetype="application/json") -@app.route('/{}/metadata/contributions_count'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/metadata/contributions_count'.format(API_VERSION), methods=["GET"]) def contributions_count(): repo_info_sql = s.sql.text(""" select repo_git, count(*) as contributions from contributor_repo @@ -64,7 +64,7 @@ def contributions_count(): status=200, mimetype="application/json") -@app.route('/{}/metadata/contributors_count'.format(AUGUR_API_VERSION), methods=["GET"]) +@app.route('/{}/metadata/contributors_count'.format(API_VERSION), methods=["GET"]) def contributors_count(): repo_info_sql = s.sql.text(""" select repo_git, count(distinct(cntrb_id)) as contributors from contributor_repo diff --git a/augur/api/routes/nonstandard_metrics.py b/collectoss/api/routes/nonstandard_metrics.py similarity index 67% rename from augur/api/routes/nonstandard_metrics.py rename to collectoss/api/routes/nonstandard_metrics.py index 8c6b7df1c..b7b44578d 100644 --- a/augur/api/routes/nonstandard_metrics.py +++ b/collectoss/api/routes/nonstandard_metrics.py @@ -4,14 +4,14 @@ import pandas as pd from flask import Response -from augur.api.metrics.repo_meta import license_files -from augur.api.metrics.insight import top_insights +from collectoss.api.metrics.repo_meta import license_files +from collectoss.api.metrics.insight import top_insights -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION from ..server import app, route_transform -@app.route(f"/{AUGUR_API_VERSION}/<license_id>/<spdx_binary>/<repo_group_id>/<repo_id>/license-files")
+@app.route(f"/{API_VERSION}/<license_id>/<spdx_binary>/<repo_group_id>/<repo_id>/license-files") def get_license_files(license_id, spdx_binary, repo_group_id, repo_id): arguments = [license_id, spdx_binary, repo_group_id, repo_id] license_files = route_transform(license_files, args=arguments) @@ -19,7 +19,7 @@ def get_license_files(license_id, spdx_binary, repo_group_id, repo_id): return Response(response=data, status=200, mimetype="application/json") -@app.route(f"/{AUGUR_API_VERSION}/repo-groups/<repo_group_id>/top-insights") +@app.route(f"/{API_VERSION}/repo-groups/<repo_group_id>/top-insights") def top_insights(repo_group_id): data = route_transform(top_insights, args=[repo_group_id]) return Response(response=data, diff --git a/augur/api/routes/user.py b/collectoss/api/routes/user.py similarity index 86% rename from augur/api/routes/user.py rename to collectoss/api/routes/user.py index 3a76aa407..b02d150e7 100644 --- a/augur/api/routes/user.py +++ b/collectoss/api/routes/user.py @@ -2,7 +2,7 @@ """ Creates routes for user functionality """ -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION import logging import secrets @@ -11,19 +11,19 @@ from werkzeug.security import check_password_hash from sqlalchemy.orm import object_session -from augur.application.db import get_session -from augur.api.util import api_key_required -from augur.api.util import ssl_required +from collectoss.application.db import get_session +from collectoss.api.util import api_key_required +from collectoss.api.util import ssl_required -from augur.application.db.models import User, UserSessionToken, RefreshToken -from augur.tasks.init.redis_connection import get_redis_connection +from collectoss.application.db.models import User, UserSessionToken, RefreshToken +from collectoss.tasks.init.redis_connection import get_redis_connection from ..server import app logger = logging.getLogger(__name__) current_user: User = current_user -@app.route(f"/{AUGUR_API_VERSION}/user/validate", methods=['POST']) +@app.route(f"/{API_VERSION}/user/validate", methods=['POST']) @ssl_required def validate_user(): username = request.args.get("username") @@ -48,7 +48,7 @@ def validate_user(): return jsonify({"status": "Validated"}) -@app.route(f"/{AUGUR_API_VERSION}/user/logout", methods=['POST']) +@app.route(f"/{API_VERSION}/user/logout", methods=['POST']) @ssl_required @login_required def logout_user_func(): @@ -58,7 +58,7 @@ def logout_user_func(): return jsonify({"status": "Error when logging out"}) -@app.route(f"/{AUGUR_API_VERSION}/user/authorize", methods=['POST', 'GET']) +@app.route(f"/{API_VERSION}/user/authorize", methods=['POST', 'GET']) @ssl_required @login_required def user_authorize(): @@ -71,7 +71,7 @@ def user_authorize(): return jsonify({"status": "Validated", "code": code}) -@app.route(f"/{AUGUR_API_VERSION}/user/session/generate", methods=['POST']) +@app.route(f"/{API_VERSION}/user/session/generate", methods=['POST']) @ssl_required @api_key_required def generate_session(application): @@ -115,7 +115,7 @@ def generate_session(application): return response -@app.route(f"/{AUGUR_API_VERSION}/user/session/refresh", methods=["GET", "POST"]) +@app.route(f"/{API_VERSION}/user/session/refresh", methods=["GET", "POST"]) @ssl_required @api_key_required def refresh_session(application): @@ -151,7 +151,7 @@ def refresh_session(application): return response -@app.route(f"/{AUGUR_API_VERSION}/user/query", methods=['POST']) +@app.route(f"/{API_VERSION}/user/query", methods=['POST']) @ssl_required def query_user(): username = request.args.get("username") @@ -163,7 +163,7 @@ def query_user(): return
jsonify({"status": True}) -@app.route(f"/{AUGUR_API_VERSION}/user/create", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/create", methods=['GET', 'POST']) @ssl_required def create_user(): username = request.args.get("username") @@ -178,7 +178,7 @@ def create_user(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/remove", methods=['POST', 'DELETE']) +@app.route(f"/{API_VERSION}/user/remove", methods=['POST', 'DELETE']) @ssl_required @login_required def delete_user(): @@ -186,7 +186,7 @@ def delete_user(): return jsonify(status) -@app.route(f"/{AUGUR_API_VERSION}/user/update", methods=['POST']) +@app.route(f"/{API_VERSION}/user/update", methods=['POST']) @ssl_required @login_required def update_user(): @@ -222,7 +222,7 @@ def update_user(): return jsonify({"status": "Missing argument"}), 400 -@app.route(f"/{AUGUR_API_VERSION}/user/repo/add", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/repo/add", methods=['GET', 'POST']) @ssl_required @login_required def add_user_repo(): @@ -234,7 +234,7 @@ def add_user_repo(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/group/add", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/group/add", methods=['GET', 'POST']) @ssl_required @login_required def add_user_group(): @@ -244,7 +244,7 @@ def add_user_group(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/group/remove", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/group/remove", methods=['GET', 'POST']) @ssl_required @login_required def remove_user_group(): @@ -255,7 +255,7 @@ def remove_user_group(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/org/add", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/org/add", methods=['GET', 'POST']) @ssl_required @login_required def add_user_org(): @@ -267,7 +267,7 @@ def add_user_org(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/repo/remove", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/repo/remove", methods=['GET', 'POST']) @ssl_required @login_required def remove_user_repo(): @@ -282,7 +282,7 @@ def remove_user_repo(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/group/repos/", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/group/repos/", methods=['GET', 'POST']) @ssl_required @login_required def group_repos(): @@ -325,7 +325,7 @@ def group_repos(): return jsonify(result_dict) -@app.route(f"/{AUGUR_API_VERSION}/user/group/repos/count", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/group/repos/count", methods=['GET', 'POST']) @ssl_required @login_required def group_repo_count(): @@ -353,7 +353,7 @@ def group_repo_count(): return jsonify(result_dict) -@app.route(f"/{AUGUR_API_VERSION}/user/groups/names", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/groups/names", methods=['GET', 'POST']) @ssl_required @login_required def get_user_groups(): @@ -373,7 +373,7 @@ def get_user_groups(): return jsonify({"status": "success", "group_names": result[0]}) -@app.route(f"/{AUGUR_API_VERSION}/user/groups/repos/", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/groups/repos/", methods=['GET', 'POST']) @ssl_required @login_required def get_user_groups_and_repos(): @@ -417,7 +417,7 @@ def get_user_groups_and_repos(): return jsonify({"status": "success", "data": data}) -@app.route(f"/{AUGUR_API_VERSION}/user/group/favorite/toggle", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/group/favorite/toggle", methods=['GET', 'POST']) 
@ssl_required @login_required def toggle_user_group_favorite(): @@ -434,7 +434,7 @@ def toggle_user_group_favorite(): return jsonify(result[1]) -@app.route(f"/{AUGUR_API_VERSION}/user/groups/favorites", methods=['GET', 'POST']) +@app.route(f"/{API_VERSION}/user/groups/favorites", methods=['GET', 'POST']) @ssl_required @login_required def get_favorite_groups(): diff --git a/augur/api/routes/util.py b/collectoss/api/routes/util.py similarity index 92% rename from augur/api/routes/util.py rename to collectoss/api/routes/util.py index 16d3c4db1..aeaa2e602 100644 --- a/augur/api/routes/util.py +++ b/collectoss/api/routes/util.py @@ -1,5 +1,5 @@ #SPDX-License-Identifier: MIT -from augur.api.routes import AUGUR_API_VERSION +from collectoss.api.routes import API_VERSION from ..server import app import base64 import sqlalchemy as s @@ -7,19 +7,19 @@ import json from flask import Response, current_app, jsonify -from augur.application.db.lib import get_value -from augur.application.logs import AugurLogger +from collectoss.application.db.lib import get_value +from collectoss.application.logs import SystemLogger -logger = AugurLogger("augur").get_logger() +logger = SystemLogger("collectoss").get_logger() @app.route("/api") def get_api_version(): return jsonify({ "status": "up", - "route": AUGUR_API_VERSION + "route": API_VERSION }) -@app.route('/{}/repo-groups'.format(AUGUR_API_VERSION)) +@app.route('/{}/repo-groups'.format(API_VERSION)) def get_all_repo_groups(): #TODO: make this name automatic - wrapper? repoGroupsSQL = s.sql.text(""" SELECT * @@ -34,7 +34,7 @@ def get_all_repo_groups(): #TODO: make this name automatic - wrapper? status=200, mimetype="application/json") -@app.route('/{}/repos'.format(AUGUR_API_VERSION)) +@app.route('/{}/repos'.format(API_VERSION)) def get_all_repos(): get_all_repos_sql = s.sql.text(""" @@ -78,7 +78,7 @@ def get_all_repos(): status=200, mimetype="application/json") -@app.route('/{}/repo-groups/<repo_group_id>/repos'.format(AUGUR_API_VERSION)) +@app.route('/{}/repo-groups/<repo_group_id>/repos'.format(API_VERSION)) def get_repos_in_repo_group(repo_group_id): repos_in_repo_groups_SQL = s.sql.text(""" SELECT @@ -114,7 +114,7 @@ def get_repos_in_repo_group(repo_group_id): status=200, mimetype="application/json") -@app.route('/{}/repos/<repo_id>'.format(AUGUR_API_VERSION)) +@app.route('/{}/repos/<repo_id>'.format(API_VERSION)) def get_repo_by_id(repo_id: int) -> Response: repo_by_id_SQL = s.sql.text(""" SELECT @@ -157,7 +157,7 @@ def get_repo_by_id(repo_id: int) -> Response: status=200, mimetype="application/json") -@app.route('/{}/owner/<owner>/repo/<repo>'.format(AUGUR_API_VERSION)) +@app.route('/{}/owner/<owner>/repo/<repo>'.format(API_VERSION)) def get_repo_by_git_name(owner, repo): get_repo_by_git_name_sql = s.sql.text(""" @@ -174,7 +174,7 @@ def get_repo_by_git_name(owner, repo): status=200, mimetype="application/json") -@app.route('/{}/rg-name/<rg_name>/repo-name/<repo_name>'.format(AUGUR_API_VERSION)) +@app.route('/{}/rg-name/<rg_name>/repo-name/<repo_name>'.format(API_VERSION)) def get_repo_by_name(rg_name, repo_name): get_repo_by_name_sql = s.sql.text(""" @@ -193,7 +193,7 @@ def get_repo_by_name(rg_name, repo_name): status=200, mimetype="application/json") -@app.route('/{}/rg-name/<rg_name>'.format(AUGUR_API_VERSION)) +@app.route('/{}/rg-name/<rg_name>'.format(API_VERSION)) def get_group_by_name(rg_name): groupSQL = s.sql.text(""" SELECT repo_group_id, rg_name @@ -208,7 +208,7 @@ def get_group_by_name(rg_name): status=200, mimetype="application/json") -@app.route('/{}/dosocs/repos'.format(AUGUR_API_VERSION)) +@app.route('/{}/dosocs/repos'.format(API_VERSION)) def get_repos_for_dosocs():
get_repos_for_dosocs_SQL = s.sql.text(""" SELECT b.repo_id, CONCAT(a.value || b.repo_group_id || chr(47) || b.repo_path || b.repo_name) AS path @@ -223,8 +223,8 @@ def get_repos_for_dosocs(): status=200, mimetype='application/json') -@app.route('/{}/repo-groups/<repo_group_id>/get-issues'.format(AUGUR_API_VERSION)) -@app.route('/{}/repos/<repo_id>/get-issues'.format(AUGUR_API_VERSION)) +@app.route('/{}/repo-groups/<repo_group_id>/get-issues'.format(API_VERSION)) +@app.route('/{}/repos/<repo_id>/get-issues'.format(API_VERSION)) def get_issues(repo_group_id, repo_id=None): if not repo_id: get_issues_sql = s.sql.text(""" @@ -275,7 +275,7 @@ def get_issues(repo_group_id, repo_id=None): status=200, mimetype='application/json') -@app.route('/{}/api-port'.format(AUGUR_API_VERSION)) +@app.route('/{}/api-port'.format(API_VERSION)) def api_port(): response = {'port': get_value('Server', 'port')} diff --git a/augur/api/server.py b/collectoss/api/server.py similarity index 91% rename from augur/api/server.py rename to collectoss/api/server.py index e66228c51..2c71dfcd1 100644 --- a/augur/api/server.py +++ b/collectoss/api/server.py @@ -1,5 +1,5 @@ #SPDX-License-Identifier: MIT -"""Creates a WSGI server that serves the Augur REST API.""" +"""Creates a WSGI server that serves the CollectOSS REST API.""" import glob import sys @@ -24,18 +24,18 @@ from graphene_sqlalchemy import SQLAlchemyObjectType -from augur.application.logs import AugurLogger -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig -from augur.application.db.engine import get_database_string, create_database_engine -from augur.application.db.models import Repo, Issue, PullRequest, Message, PullRequestReview, Commit, IssueAssignee, PullRequestAssignee, PullRequestCommit, PullRequestFile, Contributor, IssueLabel, PullRequestLabel, ContributorsAlias, Release, ClientApplication +from collectoss.application.logs import SystemLogger +from collectoss.application.db.session import DatabaseSession +from collectoss.application.config import SystemConfig +from collectoss.application.db.engine import get_database_string, create_database_engine +from collectoss.application.db.models import Repo, Issue, PullRequest, Message, PullRequestReview, Commit, IssueAssignee, PullRequestAssignee, PullRequestCommit, PullRequestFile, Contributor, IssueLabel, PullRequestLabel, ContributorsAlias, Release, ClientApplication -from metadata import __version__ as augur_code_version +from metadata import __version__ as code_version -# from augur.api.routes import AUGUR_API_VERSION -AUGUR_API_VERSION = "api/unstable" +# from collectoss.api.routes import API_VERSION +API_VERSION = "api/unstable" show_metadata = False @@ -47,7 +47,7 @@ def get_file_id(path: str) -> str: path: file path Examples: - If the path /augur/best_routes.py is given it will return "best_routes" + If the path /collectoss/best_routes.py is given it will return "best_routes" Returns: the filename as a string @@ -62,8 +62,8 @@ def create_metrics() -> None: # import the metric modules and add them to the flask app using add_metrics for file in metric_files: - importlib.import_module(f"augur.api.metrics.{file}") - add_metrics(f"augur.api.metrics.{file}") + importlib.import_module(f"collectoss.api.metrics.{file}") + add_metrics(f"collectoss.api.metrics.{file}") def add_metrics(module_name: str) -> None: @@ -75,7 +75,7 @@ def add_metrics(module_name: str) -> None: Note: The attribute is_metric and obj.metadata['type'] - are set in file augur/api/routes/util.py in the function + are set in file
collectoss/api/routes/util.py in the function register_metric(). This function is a decorator and is how a function is defined as a metric. @@ -102,14 +102,14 @@ def add_metrics(module_name: str) -> None: def get_metric_files() -> List[str]: - """Get list of all the metrics files in the augur/api/metrics directory, + """Get list of all the metrics files in the collectoss/api/metrics directory, Returns: list of file names """ metric_files = [] - for filename in glob.iglob("augur/api/metrics/**"): + for filename in glob.iglob("collectoss/api/metrics/**"): file_id = get_file_id(filename) # this filters out files like __init__ and __pycache__. And makes sure it only get py files @@ -263,9 +263,9 @@ def add_standard_metric(function: Any, endpoint: str) -> None: function: the function that needs to be mapped to the routes endpoint: the path that the endpoint should be defined as """ - repo_endpoint = f'/{app.augur_api_version}/repos/<repo_id>/{endpoint}' - repo_group_endpoint = f'/{app.augur_api_version}/repo-groups/<repo_group_id>/{endpoint}' - deprecated_repo_endpoint = f'/{app.augur_api_version}/repo-groups/<repo_group_id>/repos/<repo_id>/{endpoint}' + repo_endpoint = f'/{app.api_version}/repos/<repo_id>/{endpoint}' + repo_group_endpoint = f'/{app.api_version}/repo-groups/<repo_group_id>/{endpoint}' + deprecated_repo_endpoint = f'/{app.api_version}/repo-groups/<repo_group_id>/repos/<repo_id>/{endpoint}' # These three lines are defining routes on the flask app, and passing a function. @@ -287,7 +287,7 @@ def add_toss_metric(function: Any, endpoint: str) -> None: function: the function that needs to be mapped to the routes endpoint: the path that the endpoint should be defined as """ - repo_endpoint = f'/{app.augur_api_version}/repos/<repo_id>/{endpoint}' + repo_endpoint = f'/{app.api_version}/repos/<repo_id>/{endpoint}' app.route(repo_endpoint)(routify(function, 'repo')) def create_cache_manager() -> CacheManager: @@ -320,18 +320,18 @@ def get_server_cache(cache_manager) -> Cache: server cache """ - expire = int(augur_config.get_value('Server', 'cache_expire')) + expire = int(system_config.get_value('Server', 'cache_expire')) server_cache = cache_manager.get_cache('server', expire=expire) server_cache.clear() return server_cache -logger = AugurLogger("server").get_logger() +logger = SystemLogger("server").get_logger() url = get_database_string() engine = create_database_engine(url, poolclass=StaticPool) db_session = DatabaseSession(logger, engine) -augur_config = AugurConfig(logger, db_session) +system_config = SystemConfig(logger, db_session) def get_connection(table, cursor_field_name, connection_class, after, limit, extra_condition=False): @@ -671,7 +671,7 @@ def resolve_contributor(self, info, id): # defines the api version on the flask app, # so when we pass the flask app to the routes files we # know can access the api version via the app variable -app.augur_api_version = AUGUR_API_VERSION +app.api_version = API_VERSION app.engine = engine CORS(app) @@ -690,17 +690,17 @@ def index(): """ Redirects to health check route """ - return redirect(app.augur_api_version) + return redirect(app.api_version) -@app.route(f'/{app.augur_api_version}/') -@app.route(f'/{app.augur_api_version}/status') +@app.route(f'/{app.api_version}/') +@app.route(f'/{app.api_version}/status') def status(): """ Health check route """ status = { 'status': 'OK', - 'version': augur_code_version + 'version': code_version } return Response(response=json.dumps(status), status=200, @@ -723,12 +723,12 @@ def dispatch_request(self): schema = graphene.Schema(query=Query) -app.add_url_rule(f'/{app.augur_api_version}/graphql',
view_func=AuthenticatedGraphQLView.as_view('graphql', schema=schema, graphiql=True)) +app.add_url_rule(f'/{app.api_version}/graphql', view_func=AuthenticatedGraphQLView.as_view('graphql', schema=schema, graphiql=True)) from .routes import * # import frontend routes -from .view.augur_view import * +from .view.api_view import * from .view.routes import * from .view.api import * diff --git a/augur/api/ssl/.gitignore b/collectoss/api/ssl/.gitignore similarity index 100% rename from augur/api/ssl/.gitignore rename to collectoss/api/ssl/.gitignore diff --git a/augur/api/ssl/README.md b/collectoss/api/ssl/README.md similarity index 62% rename from augur/api/ssl/README.md rename to collectoss/api/ssl/README.md index dbe14fa30..10c3c9597 100644 --- a/augur/api/ssl/README.md +++ b/collectoss/api/ssl/README.md @@ -13,9 +13,9 @@ Then `chmod youruser *` and Enabling HTTPS -------------------- -HTTPS is an extension of HTTP. It is used for secure communications over a computer networks by encrypting your data so it is not vulnerable to MIM(Man-in-the-Middle) attacks etc. While Augur's API data might not be very sensitive, it would still be a nice feature to have so something can't interfere and provide wrong data. Additionally, the user may not feel very comfortable using an application when the browser is telling the user it is not secure. Features such as logins is an example of information that would be particularly vulnerable to attacks. Lastly, search engine optimization actually favors applications on HTTPS over HTTP. +HTTPS is an extension of HTTP. It is used for secure communication over computer networks: it encrypts your data so it is not vulnerable to man-in-the-middle (MITM) attacks. While CollectOSS's API data might not be very sensitive, HTTPS is still a nice feature to have so nothing can interfere and provide wrong data. Additionally, the user may not feel very comfortable using an application when the browser is telling them it is not secure. Features such as logins are an example of information that would be particularly vulnerable to attacks. Lastly, search engine optimization actually favors applications on HTTPS over HTTP. -This guide will start on a fully configured EC2 Ubuntu 20.04 instance, meaning it is assumed to already have Augur installed and running with all of its dependencies(PostgreSQL, Nginx, etc). +This guide will start on a fully configured EC2 Ubuntu 20.04 instance, meaning it is assumed to already have CollectOSS installed and running with all of its dependencies (PostgreSQL, Nginx, etc.).
~~~~~~~~~~~~~~~~~~~~~ Let's Encrypt/Certbot @@ -27,4 +27,4 @@ The easiest way to get an HTTPS server up is to make use of `Let's Encrypt <https://letsencrypt.org/>`_ -@app.route('/requests/report/wait/<id>') -def wait_for_report_request(id): - requestReports(id) - return jsonify(report_requests[id]) diff --git a/augur/api/view/augur_view.py b/collectoss/api/view/api_view.py similarity index 90% rename from augur/api/view/augur_view.py rename to collectoss/api/view/api_view.py index ff4b25145..b2f5a2925 100644 --- a/augur/api/view/augur_view.py +++ b/collectoss/api/view/api_view.py @@ -7,9 +7,9 @@ # from .server import User from ..server import app, db_session -from augur.application.db.models import User, UserSessionToken -from augur.api.routes import AUGUR_API_VERSION -from augur.api.util import get_bearer_token +from collectoss.application.db.models import User, UserSessionToken +from collectoss.api.routes import API_VERSION +from collectoss.api.util import get_bearer_token import time, traceback @@ -26,21 +26,21 @@ # Code 404 response page, for pages not found @app.errorhandler(404) def page_not_found(error): - if AUGUR_API_VERSION in str(request.path): + if API_VERSION in str(request.path): return jsonify({"status": "Not Found"}), 404 return render_template('index.j2', title='404'), 404 @app.errorhandler(405) def unsupported_method(error): - if AUGUR_API_VERSION in str(request.path): + if API_VERSION in str(request.path): return jsonify({"status": "Unsupported method"}), 405 return render_message("405 - Method not supported", "The resource you are trying to access does not support the request method used"), 405 @app.errorhandler(500) def internal_server_error(error): - if AUGUR_API_VERSION in str(request.path): + if API_VERSION in str(request.path): return jsonify({"status": error.original_exception}), 500 error = error.original_exception try: @@ -57,7 +57,7 @@ def internal_server_error(error): @login_manager.unauthorized_handler def unauthorized(): - if AUGUR_API_VERSION in str(request.path): + if API_VERSION in str(request.path): token_str = get_bearer_token() token = db_session.query(UserSessionToken).filter(UserSessionToken.token == token_str).first() if not token: diff --git a/augur/api/view/init.py b/collectoss/api/view/init.py similarity index 79% rename from augur/api/view/init.py rename to collectoss/api/view/init.py index 173713135..ab4708793 100644 --- a/augur/api/view/init.py +++ b/collectoss/api/view/init.py @@ -1,7 +1,7 @@ import os from pathlib import Path from .server import Environment -from augur.application.logs import AugurLogger +from collectoss.application.logs import SystemLogger import secrets, yaml env = Environment() @@ -9,7 +9,6 @@ # load configuration files and initialize globals configFile = Path(env.setdefault("CONFIG_LOCATION", "config.yml")) -report_requests = {} settings = {} def init_settings(): @@ -17,7 +16,7 @@ settings["approot"] = "/" settings["caching"] = "static/cache/" settings["cache_expiry"] = 604800 - settings["serving"] = "http://augur.chaoss.io/api/unstable" + settings["serving"] = "http://example.com/api/unstable" settings["pagination_offset"] = 25 settings["session_key"] = secrets.token_hex() @@ -34,4 +33,4 @@ def write_settings(current_settings): # Initialize logging def init_logging(): global logger - logger = AugurLogger("augur_view", reset_logfiles=False).get_logger() + logger = SystemLogger("api_view", reset_logfiles=False).get_logger() diff --git a/augur/api/view/routes.py b/collectoss/api/view/routes.py similarity index 96% rename from augur/api/view/routes.py rename to
collectoss/api/view/routes.py index 91d23531b..15ab991b3 100644 --- a/augur/api/view/routes.py +++ b/collectoss/api/view/routes.py @@ -1,5 +1,5 @@ """ -Defines the api routes for the augur views +Defines the api routes for the collectoss views """ import logging import math @@ -7,10 +7,10 @@ from .utils import * from flask_login import login_user, logout_user, current_user, login_required -from augur.application.db.models import User, Repo, ClientApplication +from collectoss.application.db.models import User, Repo, ClientApplication from .server import LoginException -from augur.application.util import * -from augur.application.db.lib import get_value +from collectoss.application.util import * +from collectoss.application.db.lib import get_value from ..server import app, db_session logger = logging.getLogger(__name__) @@ -31,15 +31,15 @@ def root(path=""): """ ---------------------------------------------------------------- logo: this route returns a redirect to the application logo associated - with the provided brand, otherwise the inverted Augur logo if no + with the provided brand, otherwise the inverted CollectOSS logo if no brand is provided. """ @app.route('/logo/') @app.route('/logo/<brand>') def logo(brand=None): if brand is None: - return redirect(url_for('static', filename='img/augur_logo.png')) - if "augur" in brand: + return redirect(url_for('static', filename='img/collectoss_logo.png')) + if "collectoss" in brand: return logo(None) if "chaoss" in brand: return redirect(url_for('static', filename='img/Chaoss_Logo_white.png')) diff --git a/augur/api/view/run.sh b/collectoss/api/view/run.sh similarity index 100% rename from augur/api/view/run.sh rename to collectoss/api/view/run.sh diff --git a/augur/api/view/server/Environment.py b/collectoss/api/view/server/Environment.py similarity index 100% rename from augur/api/view/server/Environment.py rename to collectoss/api/view/server/Environment.py diff --git a/augur/api/view/server/LoginException.py b/collectoss/api/view/server/LoginException.py similarity index 100% rename from augur/api/view/server/LoginException.py rename to collectoss/api/view/server/LoginException.py diff --git a/augur/api/view/server/__init__.py b/collectoss/api/view/server/__init__.py similarity index 100% rename from augur/api/view/server/__init__.py rename to collectoss/api/view/server/__init__.py diff --git a/augur/api/view/url_converters.py b/collectoss/api/view/url_converters.py similarity index 100% rename from augur/api/view/url_converters.py rename to collectoss/api/view/url_converters.py diff --git a/augur/api/view/utils.py b/collectoss/api/view/utils.py similarity index 98% rename from augur/api/view/utils.py rename to collectoss/api/view/utils.py index dbfdd1b12..a48979946 100644 --- a/augur/api/view/utils.py +++ b/collectoss/api/view/utils.py @@ -1,12 +1,12 @@ """ -Defines utility functions used by the augur api views +Defines utility functions used by the collectoss api views """ from pathlib import Path from concurrent.futures import ThreadPoolExecutor from flask import render_template, flash, url_for from .init import init_logging from .init import * -from augur.application.db.lib import get_value +from collectoss.application.db.lib import get_value import urllib.error, math, yaml, urllib3, time, math @@ -41,7 +41,7 @@ def loadSettings(): # # Ensure that the cache directory exists and is valid # cachePath = Path(settings["caching"]) - cachePath = Path.cwd() / "augur" / "static" / "cache" + cachePath = Path.cwd() / "collectoss" / "static" / "cache" if
not cachePath.is_dir(): if cachePath.is_file(): diff --git a/augur/application/__init__.py b/collectoss/application/__init__.py similarity index 80% rename from augur/application/__init__.py rename to collectoss/application/__init__.py index 9091d6232..3bbade578 100644 --- a/augur/application/__init__.py +++ b/collectoss/application/__init__.py @@ -2,7 +2,7 @@ def requires_db_session(logger): def inner_decorator(fun): def wrapper(*args, **kwargs): - from augur.application.db.session import DatabaseSession + from collectoss.application.db.session import DatabaseSession # create DB session with DatabaseSession(logger) as session: diff --git a/augur/application/cli/__init__.py b/collectoss/application/cli/__init__.py similarity index 68% rename from augur/application/cli/__init__.py rename to collectoss/application/cli/__init__.py index e68af307b..8081d6a8e 100644 --- a/augur/application/cli/__init__.py +++ b/collectoss/application/cli/__init__.py @@ -8,40 +8,48 @@ import httpx import traceback -from augur.application.db.engine import DatabaseEngine -from augur.application.db import get_engine, dispose_database_engine +from collectoss.application.db.engine import DatabaseEngine +from collectoss.application.db import get_engine, dispose_database_engine from sqlalchemy.exc import OperationalError -def test_connection(function_internet_connection): - @click.pass_context - def new_func(ctx, *args, **kwargs): - usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0] - success = False - with httpx.Client() as client: +def check_connectivity(urls=["http://chaoss.community", "http://github.com", "http://gitlab.com"], timeout=10.0): + """ + Checks connectivity against a list of URLs. + Returns True if at least one URL is reachable, False otherwise. + """ + # Split the total timeout budget evenly across the candidate URLs. + with httpx.Client(timeout=timeout/len(urls), follow_redirects=True) as client: + for i, url in enumerate(urls): try: + # A HEAD request would be faster where servers support it, + # but GET is handled universally, so use GET. - _ = client.request( - method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True) - success = True + response = client.request("GET", url) + if response.is_success: + return True except (TimeoutError, httpx.TimeoutException): - print("Request timed out.") + print(f"Connectivity test Request {i} timed out.") except httpx.NetworkError as e: - print(f"Network Error: {httpx.NetworkError}") + print(f"Connectivity test Request {i} Network Error: {e}") print(traceback.format_exc()) except httpx.ProtocolError as e: - print(f"Protocol Error: {httpx.ProtocolError}") + print(f"Connectivity test Request {i} Protocol Error: {e}") print(traceback.format_exc()) + return False + +def test_connection(function_internet_connection): + @click.pass_context + def new_func(ctx, *args, **kwargs): + usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0] + if not check_connectivity(): + print( + f""" + \n\n{usage} command setup failed. + There was an error while testing for network connectivity + Please check your connection to the internet to run CollectOSS + Consider setting http_proxy variables for limited access installations.""" + ) + sys.exit(-1) - if not success: - print( - f""" - \n\n{usage} command setup failed.
- There was an error while testing for network connectivity - Please check your connection to the internet to run Augur - Consider setting http_proxy variables for limited access installations.""" - ) - sys.exit(-1) - return ctx.invoke(function_internet_connection, *args, **kwargs) return update_wrapper(new_func, function_internet_connection) @@ -57,10 +65,10 @@ def new_func(ctx, *args, **kwargs): return ctx.invoke(function_db_connection, *args, **kwargs) except OperationalError as e: - augur_db_environment_var = os.getenv("AUGUR_DB") + db_environment_var = os.getenv("AUGUR_DB") # determine the location to print in error string - if augur_db_environment_var: + if db_environment_var: location = f"the AUGUR_DB environment variable\nAUGUR_DB={os.getenv('AUGUR_DB')}" else: with open("db.config.json", 'r') as f: @@ -121,17 +129,3 @@ def new_func(ctx, *args, **kwargs): # ctx.obj = Application(offline_mode=True).config # return ctx.invoke(f, ctx.obj, *args, **kwargs) # return update_wrapper(new_func, f) - -# def pass_logs_dir(f): -# @click.pass_context -# def new_func(ctx, *args, **kwargs): -# config = AugurConfig(ROOT_AUGUR_DIRECTORY) -# ctx.obj = AugurLogging.get_log_directories(config, reset_logfiles=False) -# return ctx.invoke(f, ctx.obj, *args, **kwargs) -# return update_wrapper(new_func, f) - -# def initialize_logging(f): -# def new_func(*args, **kwargs): -# AugurLogging(reset_logfiles=False) -# return f(*args, **kwargs) -# return update_wrapper(new_func, f) \ No newline at end of file diff --git a/augur/application/cli/_cli_util.py b/collectoss/application/cli/_cli_util.py similarity index 90% rename from augur/application/cli/_cli_util.py rename to collectoss/application/cli/_cli_util.py index 6ab969c6f..0e1a7e1aa 100644 --- a/augur/application/cli/_cli_util.py +++ b/collectoss/application/cli/_cli_util.py @@ -5,13 +5,13 @@ import signal from urllib.parse import urlparse -from augur.tasks.init.redis_connection import get_redis_connection +from collectoss.tasks.init.redis_connection import get_redis_connection def clear_redis_caches(logger): """Clears the redis databases that celery and redis use.""" logger.info("Flushing all redis databases this instance was using") - celery_purge_command = "celery -A augur.tasks.init.celery_app.celery_app purge -f" + celery_purge_command = "celery -A collectoss.tasks.init.celery_app.celery_app purge -f" subprocess.call(celery_purge_command.split(" ")) @@ -23,7 +23,7 @@ def clear_rabbitmq_messages(connection_string, queues, logger): #virtual_host_string = connection_string.split("/")[-1] logger.info("Clearing all messages from celery queue in rabbitmq") - from augur.tasks.init.celery_app import celery_app + from collectoss.tasks.init.celery_app import celery_app celery_app.control.purge() clear_message_queues(connection_string, queues) diff --git a/augur/application/cli/csv_utils.py b/collectoss/application/cli/_csv_utils.py similarity index 98% rename from augur/application/cli/csv_utils.py rename to collectoss/application/cli/_csv_utils.py index e55835f6e..cce98aba4 100644 --- a/augur/application/cli/csv_utils.py +++ b/collectoss/application/cli/_csv_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT """ -CSV processing utilities for Augur CLI +CSV processing utilities for CollectOSS CLI """ import csv import logging @@ -25,7 +25,7 @@ def validate_git_url(value: str) -> bool: Returns: True if the value is a valid GitHub or GitLab URL, False otherwise """ - from augur.application.db.models import Repo + from collectoss.application.db.models import 
Repo value = value.strip() github_parse = Repo.parse_github_repo_url(value) diff --git a/augur/application/cli/_multicommand.py b/collectoss/application/cli/_multicommand.py similarity index 66% rename from augur/application/cli/_multicommand.py rename to collectoss/application/cli/_multicommand.py index 19392b274..13186e7bb 100644 --- a/augur/application/cli/_multicommand.py +++ b/collectoss/application/cli/_multicommand.py @@ -1,6 +1,6 @@ #SPDX-License-Identifier: MIT """ -Runs Augur with Gunicorn when called +Runs CollectOSS with Gunicorn when called """ import os @@ -9,11 +9,11 @@ import traceback from pathlib import Path -# import augur.application +# import collectoss.application CONTEXT_SETTINGS = dict(auto_envvar_prefix='AUGUR') -class AugurMultiCommand(click.MultiCommand): +class CLIMultiCommand(click.MultiCommand): def __commands_folder(self): return os.path.abspath(os.path.dirname(__file__)) @@ -26,20 +26,20 @@ def list_commands(self, ctx): return rv def get_command(self, ctx, name): - cmdfile = "augur/application/cli" / Path(name + ".py") + cmdfile = Path(self.__commands_folder()).joinpath(name + ".py") # Check that the command exists before importing if not cmdfile.is_file(): return # Prefer to raise exception instead of silencing it - module = importlib.import_module('.' + name, 'augur.application.cli') + module = importlib.import_module('.' + name, 'collectoss.application.cli') return module.cli -@click.command(cls=AugurMultiCommand, context_settings=CONTEXT_SETTINGS) +@click.command(cls=CLIMultiCommand, context_settings=CONTEXT_SETTINGS) @click.pass_context def run(ctx): """ - Augur is an application for open source community health analytics + CollectOSS is an application for open source community health analytics """ return ctx diff --git a/augur/application/cli/api.py b/collectoss/application/cli/api.py similarity index 59% rename from augur/application/cli/api.py rename to collectoss/application/cli/api.py index 50044de7c..a8bb9e53b 100644 --- a/augur/application/cli/api.py +++ b/collectoss/application/cli/api.py @@ -1,6 +1,6 @@ #SPDX-License-Identifier: MIT """ -Augur library commands for controlling the backend components +CollectOSS library commands for controlling the backend components """ import os import time @@ -12,13 +12,13 @@ import uuid import traceback -from augur.application.db.session import DatabaseSession -from augur.application.logs import AugurLogger -from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext -from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages -from augur.application.db.lib import get_value +from collectoss.application.db.session import DatabaseSession +from collectoss.application.logs import SystemLogger +from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext +from collectoss.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages +from collectoss.application.db.lib import get_value -logger = AugurLogger("augur", reset_logfiles=False).get_logger() +logger = SystemLogger("collectoss", reset_logfiles=False).get_logger() @click.group('api', short_help='Commands for controlling the backend API server') @click.pass_context @@ -33,7 +33,7 @@ def cli(ctx): @with_database @click.pass_context def start(ctx, development, port): - """Start Augur's backend server.""" + """Start
CollectOSS's backend server.""" try: if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": @@ -50,23 +50,23 @@ def start(ctx, development, port): logger.info("Starting in development mode") try: - gunicorn_location = os.getcwd() + "/augur/api/gunicorn_conf.py" + gunicorn_location = os.getcwd() + "/collectoss/api/gunicorn_conf.py" except FileNotFoundError: - logger.error("\n\nPlease run augur commands in the root directory\n\n") + logger.error("\n\nPlease run collectoss commands in the root directory\n\n") host = get_value("Server", "host") if not port: port = get_value("Server", "port") - gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log" + gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} collectoss.api.server:app --log-file gunicorn.log" server = subprocess.Popen(gunicorn_command.split(" ")) time.sleep(3) logger.info('Gunicorn webserver started...') - logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') + logger.info(f'CollectOSS is running at: {"http" if development else "https"}://{host}:{port}') - frontend_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" + frontend_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" frontend_worker_process = subprocess.Popen(frontend_worker.split(" ")) try: @@ -86,39 +86,36 @@ @click.pass_context def stop(ctx): """ - Sends SIGTERM to all Augur api processes + Sends SIGTERM to all CollectOSS API processes """ - logger = logging.getLogger("augur.cli") + logger = logging.getLogger("collectoss.cli") - augur_stop(signal.SIGTERM, logger, ctx.obj.engine) + stop_processes(signal.SIGTERM, logger, ctx.obj.engine) @cli.command('kill') @with_database @click.pass_context def kill(ctx): """ - Sends SIGKILL to all Augur api processes + Sends SIGKILL to all CollectOSS API processes """ - logger = logging.getLogger("augur.cli") - augur_stop(signal.SIGKILL, logger, ctx.obj.engine) + logger = logging.getLogger("collectoss.cli") + stop_processes(signal.SIGKILL, logger, ctx.obj.engine) @cli.command('processes') def processes(): """ - Outputs the name/PID of all Augur api process""" - augur_processes = get_augur_api_processes() - for process in augur_processes: + Outputs the name/PID of all CollectOSS API processes""" + for process in get_api_processes(): logger.info(f"Found process {process.pid}") -def augur_stop(signal, logger, engine): +def stop_processes(signal, logger, engine): """ - Stops augur with the given signal, + Stops collectoss with the given signal, and cleans up the api """ - augur_processes = get_augur_api_processes() - - _broadcast_signal_to_processes(augur_processes, logger=logger, broadcast_signal=signal) + _broadcast_signal_to_processes(get_api_processes(), logger=logger, broadcast_signal=signal) cleanup_after_api_halt(logger, engine) @@ -131,16 +128,16 @@ def cleanup_after_api_halt(logger, engine): clear_rabbitmq_messages(connection_string, queues, logger) clear_redis_caches(logger) -def get_augur_api_processes(): - augur_api_processes = [] +def get_api_processes(): + api_processes = [] for process in psutil.process_iter(['cmdline', 'name', 'environ']): if process.info['cmdline'] is not None and process.info['environ'] is not None: try: if is_api_process(process): - augur_api_processes.append(process) +
api_processes.append(process) except (KeyError, FileNotFoundError): pass - return augur_api_processes + return api_processes def is_api_process(process): @@ -149,9 +146,9 @@ def is_api_process(process): if process.pid != os.getpid(): - if ("augur.api.server:app" in command or - "augurbackendapi" in command or - ("augur.tasks.init.celery_app.celery_app" in command and "frontend" in command)): + if ("collectoss.api.server:app" in command or + "collectossbackendapi" in command or + ("collectoss.tasks.init.celery_app.celery_app" in command and "frontend" in command)): return True return False diff --git a/augur/application/cli/backend.py b/collectoss/application/cli/backend.py similarity index 56% rename from augur/application/cli/backend.py rename to collectoss/application/cli/backend.py index 341df8886..a07ddf198 100644 --- a/augur/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -1,6 +1,6 @@ #SPDX-License-Identifier: MIT """ -Augur library commands for controlling the backend components +CollectOSS library commands for controlling the backend components """ import resource import os @@ -14,28 +14,26 @@ import traceback import requests from redis.exceptions import ConnectionError as RedisConnectionError -from urllib.parse import urlparse - -from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records -from augur.tasks.git.facade_tasks import clone_repos -from augur.tasks.github.contributors import process_contributors -from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler -from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler -from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model -from augur.tasks.init.redis_connection import get_redis_connection -from augur.application.db.models import UserRepo -from augur.application.db.session import DatabaseSession -from augur.application.logs import AugurLogger -from augur.application.db.lib import get_value -from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext + +from collectoss.tasks.start_tasks import collection_monitor, create_collection_status_records +from collectoss.tasks.git.facade_tasks import clone_repos +from collectoss.tasks.github.contributors import process_contributors +from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from collectoss.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler +from collectoss.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model +from collectoss.application.db.models import UserRepo +from collectoss.application.db.session import DatabaseSession +from collectoss.application.logs import SystemLogger +from collectoss.application.service_manager import SystemServiceManager, cleanup_collection_status_and_rabbit, clean_collection_status +from collectoss.application.db.lib import get_value +from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = os.getenv("AUGUR_RESET_LOGS", 'True').lower() in ('true', '1', 't', 'y', 'yes') -logger = AugurLogger("augur", reset_logfiles=reset_logs).get_logger() - +logger = SystemLogger("collectoss", reset_logfiles=reset_logs).get_logger() @click.group('server', short_help='Commands for controlling the backend API server & data collection 
workers') @click.pass_context @@ -52,20 +50,26 @@ def cli(ctx): @with_database @click.pass_context def start(ctx, disable_collection, development, pidfile, port): - """Start Augur's backend server.""" + """Start CollectOSS's backend server.""" with open(pidfile, "w") as pidfile_io: pidfile_io.write(str(os.getpid())) - + + manager = SystemServiceManager(ctx, pidfile, disable_collection) + + # Register signal handlers for graceful shutdown + signal.signal(signal.SIGTERM, manager.shutdown_signal_handler) + signal.signal(signal.SIGINT, manager.shutdown_signal_handler) + try: if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) - except Exception as e: + except Exception as e: logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) - + logger.error("Failed to raise open file limit!") raise e - + if development: os.environ["AUGUR_DEV"] = "1" logger.info("Starting in development mode") @@ -73,9 +77,9 @@ def start(ctx, disable_collection, development, pidfile, port): os.environ["AUGUR_PIDFILE"] = pidfile try: - gunicorn_location = os.getcwd() + "/augur/api/gunicorn_conf.py" + gunicorn_location = os.getcwd() + "/collectoss/api/gunicorn_conf.py" except FileNotFoundError: - logger.error("\n\nPlease run augur commands in the root directory\n\n") + logger.error("\n\nPlease run collectoss commands in the root directory\n\n") host = get_value("Server", "host") @@ -99,8 +103,9 @@ def start(ctx, disable_collection, development, pidfile, port): log_dir = get_value("Logging", "logs_directory") or "." gunicorn_log_file = os.path.join(log_dir, "gunicorn.log") - gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file {gunicorn_log_file}" + gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} collectoss.api.server:app --log-file {gunicorn_log_file}" server = subprocess.Popen(gunicorn_command.split(" ")) + manager.server = server logger.info("awaiting Gunicorn start") while not server.poll(): @@ -119,10 +124,11 @@ def start(ctx, disable_collection, development, pidfile, port): exit(247) logger.info('Gunicorn webserver started...') - logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') + logger.info(f'CollectOSS is running at: {"http" if development else "https"}://{host}:{port}') logger.info(f"The API is available at '{api_response.json()['route']}'") processes = start_celery_worker_processes((core_worker_count, secondary_worker_count, facade_worker_count), disable_collection) + manager.processes = processes celery_beat_schedule_db = os.getenv("CELERYBEAT_SCHEDULE_DB", "celerybeat-schedule.db") if os.path.exists(celery_beat_schedule_db): @@ -131,9 +137,11 @@ def start(ctx, disable_collection, development, pidfile, port): log_level = get_value("Logging", "log_level") celery_beat_process = None - celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()} -s {celery_beat_schedule_db}" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) + celery_command = f"celery -A collectoss.tasks.init.celery_app.celery_app beat -l {log_level.lower()} -s {celery_beat_schedule_db}" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) + manager.celery_beat_process = celery_beat_process keypub = KeyPublisher() + manager.keypub = keypub if not disable_collection: if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": @@ -167,12 +175,12 @@ def start(ctx, disable_collection, development, pidfile, port): #put contributor breadth 
back in. Not sure why it was commented out contributor_breadth_model.si().apply_async() - # start cloning repos when augur starts + # start cloning repos when collectoss starts clone_repos.si().apply_async() process_contributors.si().apply_async() - augur_collection_monitor.si().apply_async() + collection_monitor.si().apply_async() else: logger.info("Collection disabled") @@ -180,29 +188,15 @@ def start(ctx, disable_collection, development, pidfile, port): try: server.wait() except KeyboardInterrupt: - - if server: - logger.info("Shutting down server") - server.terminate() - - logger.info("Shutting down all celery worker processes") - for p in processes: - if p: - p.terminate() - - if celery_beat_process: - logger.info("Shutting down celery beat process") - celery_beat_process.terminate() - - if not disable_collection: - + # Signal handler will take care of cleanup + pass + finally: + # Ensure pidfile is cleaned up if we exit normally + if os.path.exists(pidfile): try: - keypub.shutdown() - cleanup_collection_status_and_rabbit(logger, ctx.obj.engine) - except RedisConnectionError: - pass - - os.unlink(pidfile) + os.unlink(pidfile) + except OSError as e: + logger.error(f"Could not remove pidfile {pidfile}: {e}") def start_celery_worker_processes(worker_counts: tuple[int, int, int], disable_collection=False): """ @@ -223,28 +217,28 @@ def start_celery_worker_processes(worker_counts: tuple[int, int, int], disable_c sleep_time = 0 - frontend_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" + frontend_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" process_list.append(subprocess.Popen(frontend_worker.split(" "))) sleep_time += 6 if not disable_collection: #2 processes are always reserved as a baseline. 
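# --- Editor's aside (illustrative sketch, not part of this patch): the four
# worker launches below all share one command shape. Assuming only the renamed
# celery app path visible in this diff, a hypothetical helper like
# `build_worker_command` (not project API) captures the pattern:
import uuid

def build_worker_command(queue: str, concurrency: int) -> str:
    """Build one celery worker invocation for a named queue."""
    node_name = f"{queue}:{uuid.uuid4().hex}@%h"  # unique node name per process
    cmd = (f"celery -A collectoss.tasks.init.celery_app.celery_app worker"
           f" -l info --concurrency={concurrency} -n {node_name}")
    # Core workers consume the default celery queue, so they take no -Q flag.
    return cmd if queue == "core" else f"{cmd} -Q {queue}"

# e.g. build_worker_command("scheduling", 2) ->
#   "celery -A collectoss.tasks.init.celery_app.celery_app worker -l info
#    --concurrency=2 -n scheduling:<hex>@%h -Q scheduling"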
- scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + scheduling_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" process_list.append(subprocess.Popen(scheduling_worker.split(" "))) sleep_time += 6 logger.info(f"Starting core worker processes with concurrency={core_worker_count}") - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_worker_count} -n core:{uuid.uuid4().hex}@%h" + core_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency={core_worker_count} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 logger.info(f"Starting secondary worker processes with concurrency={secondary_worker_count}") - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_worker_count} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + secondary_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_worker_count} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 logger.info(f"Starting facade worker processes with concurrency={facade_worker_count}") - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_worker_count} -n facade:{uuid.uuid4().hex}@%h -Q facade" + facade_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_worker_count} -n facade:{uuid.uuid4().hex}@%h -Q facade" process_list.append(subprocess.Popen(facade_worker.split(" "))) sleep_time += 6 @@ -261,11 +255,11 @@ def start_celery_worker_processes(worker_counts: tuple[int, int, int], disable_c @click.pass_context def stop(ctx): """ - Sends SIGTERM to all Augur server & worker processes + Sends SIGTERM to all CollectOSS server & worker processes """ - logger = logging.getLogger("augur.cli") + logger = logging.getLogger("collectoss.cli") - augur_stop(signal.SIGTERM, logger, ctx.obj.engine) + stop_processes(signal.SIGTERM, logger, ctx.obj.engine) @cli.command('stop-collection-blocking') @test_connection @@ -276,7 +270,7 @@ def stop_collection(ctx): """ Stop collection tasks if they are running, block until complete """ - processes = get_augur_processes() + processes = get_backend_processes() stopped = [] @@ -322,106 +316,28 @@ def stop_collection(ctx): @click.pass_context def kill(ctx): """ - Sends SIGKILL to all Augur server & worker processes + Sends SIGKILL to all CollectOSS server & worker processes """ - logger = logging.getLogger("augur.cli") - augur_stop(signal.SIGKILL, logger, ctx.obj.engine) + logger = logging.getLogger("collectoss.cli") + stop_processes(signal.SIGKILL, logger, ctx.obj.engine) -def augur_stop(signal, logger, engine): +def stop_processes(signal, logger, engine): """ - Stops augur with the given signal, + Stops collectoss with the given signal, and cleans up collection if it was running """ - augur_processes = get_augur_processes() + backend_processes = get_backend_processes() # if celery is running, run the cleanup function - process_names = [process.name() for process in augur_processes] + process_names = [process.name() for process in backend_processes] - 
_broadcast_signal_to_processes(augur_processes, broadcast_signal=signal, given_logger=logger) + _broadcast_signal_to_processes(backend_processes, broadcast_signal=signal, given_logger=logger) if "celery" in process_names: cleanup_collection_status_and_rabbit(logger, engine) -def cleanup_collection_status_and_rabbit(logger, engine): - clear_redis_caches() - - connection_string = get_value("RabbitMQ", "connection_string") - - with DatabaseSession(logger, engine=engine) as session: - - clean_collection_status(session) - - clear_rabbitmq_messages(connection_string) - -def clear_redis_caches(): - """Clears the redis databases that celery and redis use.""" - - logger.info("Flushing all redis databases this instance was using") - celery_purge_command = "celery -A augur.tasks.init.celery_app.celery_app purge -f" - subprocess.call(celery_purge_command.split(" ")) - - redis_connection = get_redis_connection() - redis_connection.flushdb() - -def clear_all_message_queues(connection_string): - queues = ['celery','secondary','scheduling','facade'] - - virtual_host_string = connection_string.split("/")[-1] - - #Parse username and password with urllib - parsed = urlparse(connection_string) - - for q in queues: - curl_cmd = f"curl -i -u {parsed.username}:{parsed.password} -XDELETE http://localhost:15672/api/queues/{virtual_host_string}/{q}" - subprocess.call(curl_cmd.split(" "),stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - -def clear_rabbitmq_messages(connection_string): - #virtual_host_string = connection_string.split("/")[-1] - - logger.info("Clearing all messages from celery queue in rabbitmq") - from augur.tasks.init.celery_app import celery_app - celery_app.control.purge() - - clear_all_message_queues(connection_string) - #rabbitmq_purge_command = f"sudo rabbitmqctl purge_queue celery -p {virtual_host_string}" - #subprocess.call(rabbitmq_purge_command.split(" ")) - -#Make sure that database reflects collection status when processes are killed/stopped. -def clean_collection_status(session): - session.execute_sql(s.sql.text(""" - UPDATE augur_operations.collection_status - SET core_status='Pending',core_task_id = NULL - WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - - UPDATE augur_operations.collection_status - SET core_status='Success',core_task_id = NULL - WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - - UPDATE augur_operations.collection_status - SET secondary_status='Pending',secondary_task_id = NULL - WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - - UPDATE augur_operations.collection_status - SET secondary_status='Success',secondary_task_id = NULL - WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - - UPDATE augur_operations.collection_status - SET facade_status='Update', facade_task_id=NULL - WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - - UPDATE augur_operations.collection_status - SET facade_status='Success', facade_task_id=NULL - WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - - UPDATE augur_operations.collection_status - SET facade_status='Pending', facade_task_id=NULL - WHERE facade_status='Failed Clone' OR facade_status='Initializing'; - """)) - #TODO: write timestamp for currently running repos. 
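# --- Editor's aside (illustrative sketch, not part of this patch): the inline
# cleanup helpers deleted above now come from collectoss.application.service_manager
# (see the import added near the top of this file). Assuming only that import and
# the (logger, engine) call shape visible in this diff, a caller would look like:
import logging

from collectoss.application.service_manager import cleanup_collection_status_and_rabbit

def halt_cleanup(engine):
    """Reset 'Collecting' rows to Pending/Success and purge rabbitmq/redis."""
    # `halt_cleanup` itself is a hypothetical wrapper, not project API.
    logger = logging.getLogger("collectoss.cli")
    cleanup_collection_status_and_rabbit(logger, engine)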
- def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.user_repos) @@ -439,7 +355,7 @@ def export_env(config): Exports your GitHub key and database credentials """ - export_file = open(os.getenv('AUGUR_EXPORT_FILE', 'augur_export_env.sh'), 'w+') + export_file = open(os.getenv('AUGUR_EXPORT_FILE', 'collectoss_export_env.sh'), 'w+') export_file.write('#!/bin/bash') export_file.write('\n') env_file = open(os.getenv('AUGUR_ENV_FILE', 'docker_env.txt'), 'w+') @@ -456,11 +372,11 @@ def export_env(config): @cli.command('repo-reset') @test_connection @test_db_connection -def repo_reset(augur_app): +def repo_reset(backend_app): """ Refresh repo collection to force data collection """ - augur_app.database.execute(s.sql.text(""" + backend_app.database.execute(s.sql.text(""" UPDATE augur_operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; @@ -478,22 +394,21 @@ def repo_reset(augur_app): @cli.command('processes') def processes(): """ - Outputs the name/PID of all Augur server & worker processes""" - augur_processes = get_augur_processes() - for process in augur_processes: + Outputs the name/PID of all CollectOSS server & worker processes""" + for process in get_backend_processes(): logger.info(f"Found process {process.pid} [{process.name()}] -> Parent: {process.parent().pid}") -def get_augur_processes(): - augur_processes = [] +def get_backend_processes(): + process_list = [] for process in psutil.process_iter(['cmdline', 'name', 'environ']): if process.info['cmdline'] is not None and process.info['environ'] is not None: try: if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in ''.join(process.info['cmdline'][:]).lower(): if process.pid != os.getpid(): - augur_processes.append(process) + process_list.append(process) except (KeyError, FileNotFoundError): pass - return augur_processes + return process_list def _broadcast_signal_to_processes(processes, broadcast_signal=signal.SIGTERM, given_logger=None): if given_logger is None: @@ -526,43 +441,3 @@ def raise_open_file_limit(num_files): resource.setrlimit(resource.RLIMIT_NOFILE, (num_files, current_hard)) return - -# def initialize_components(augur_app, disable_housekeeper): -# master = None -# manager = None -# broker = None -# housekeeper = None -# worker_processes = [] -# mp.set_start_method('forkserver', force=True) - -# if not disable_housekeeper: - -# manager = mp.Manager() -# broker = manager.dict() -# housekeeper = Housekeeper(broker=broker, augur_app=augur_app) - -# controller = augur_app.config.get_section('Workers') -# for worker in controller.keys(): -# if controller[worker]['switch']: -# for i in range(controller[worker]['workers']): -# logger.info("Booting {} #{}".format(worker, i + 1)) -# worker_process = mp.Process(target=worker_start, name=f"{worker}_{i}", kwargs={'worker_name': worker, 'instance_number': i, 'worker_port': controller[worker]['port']}, daemon=True) -# worker_processes.append(worker_process) -# worker_process.start() - -# augur_app.manager = manager -# augur_app.broker = broker -# augur_app.housekeeper = housekeeper - -# atexit._clear() -# atexit.register(exit, augur_app, worker_processes, master) -# return AugurGunicornApp(augur_app.gunicorn_options, augur_app=augur_app) - -# def worker_start(worker_name=None, instance_number=0, worker_port=None): -# try: -# time.sleep(30 * instance_number) -# destination = subprocess.DEVNULL -# 
process = subprocess.Popen("cd workers/{} && {}_start".format(worker_name,worker_name), shell=True, stdout=destination, stderr=subprocess.STDOUT) -# logger.info("{} #{} booted.".format(worker_name,instance_number+1)) -# except KeyboardInterrupt as e: -# pass diff --git a/augur/application/cli/cache.py b/collectoss/application/cli/cache.py similarity index 80% rename from augur/application/cli/cache.py rename to collectoss/application/cli/cache.py index 000341964..d814d3452 100644 --- a/augur/application/cli/cache.py +++ b/collectoss/application/cli/cache.py @@ -1,14 +1,14 @@ #SPDX-License-Identifier: MIT """ -Augur library commands redis +CollectOSS library commands for managing the redis cache """ import click -from augur.tasks.init.redis_connection import get_redis_connection -from augur.application.logs import AugurLogger -from augur.application.cli import test_connection, test_db_connection +from collectoss.tasks.init.redis_connection import get_redis_connection +from collectoss.application.logs import SystemLogger +from collectoss.application.cli import test_connection, test_db_connection -logger = AugurLogger("augur").get_logger() +logger = SystemLogger("collectoss").get_logger() @click.group('redis', short_help='Commands for managing redis cache') def cli(): diff --git a/augur/application/cli/collection.py b/collectoss/application/cli/collection.py similarity index 75% rename from augur/application/cli/collection.py rename to collectoss/application/cli/collection.py index 810fecf74..b1a93ce80 100644 --- a/augur/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -1,6 +1,6 @@ #SPDX-License-Identifier: MIT """ -Augur library commands for controlling the backend components +CollectOSS library commands for controlling the backend components """ import os import time @@ -14,21 +14,21 @@ import traceback import sqlalchemy as s -from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records -from augur.tasks.git.facade_tasks import clone_repos -from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler -from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler -from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model -from augur.application.db.models import UserRepo -from augur.application.db.session import DatabaseSession -from augur.application.logs import AugurLogger -from augur.application.db.lib import get_value -from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext -from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages +from collectoss.tasks.start_tasks import collection_monitor, create_collection_status_records +from collectoss.tasks.git.facade_tasks import clone_repos +from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from collectoss.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler +from collectoss.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model +from collectoss.application.db.models import UserRepo +from collectoss.application.db.session import DatabaseSession +from collectoss.application.logs import SystemLogger +from collectoss.application.db.lib import get_value +from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext +from
collectoss.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages from keyman.KeyClient import KeyPublisher -logger = AugurLogger("augur", reset_logfiles=False).get_logger() +logger = SystemLogger("collectoss", reset_logfiles=False).get_logger() @click.group('server', short_help='Commands for controlling the backend API server & data collection workers') @click.pass_context @@ -42,7 +42,7 @@ def cli(ctx): @with_database @click.pass_context def start(ctx, development): - """Start Augur's backend server.""" + """Start CollectOSS's backend server.""" try: if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": @@ -90,7 +90,7 @@ def start(ctx, development): log_level = get_value("Logging", "log_level") celery_beat_process = None - celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_command = f"celery -A collectoss.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" celery_beat_process = subprocess.Popen(celery_command.split(" ")) @@ -104,10 +104,10 @@ def start(ctx, development): contributor_breadth_model.si().apply_async() - # start cloning repos when augur starts + # start cloning repos when collectoss starts clone_repos.si().apply_async() - augur_collection_monitor.si().apply_async() + collection_monitor.si().apply_async() try: @@ -145,22 +145,22 @@ def start_celery_collection_processes(worker_counts: tuple[int, int, int]): core_worker_count, secondary_worker_count, facade_worker_count = worker_counts #2 processes are always reserved as a baseline. - scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + scheduling_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" process_list.append(subprocess.Popen(scheduling_worker.split(" "))) sleep_time += 6 logger.info(f"Starting core collection processes with concurrency={core_worker_count}") - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_worker_count} -n core:{uuid.uuid4().hex}@%h" + core_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency={core_worker_count} -n core:{uuid.uuid4().hex}@%h" process_list.append(subprocess.Popen(core_worker.split(" "))) sleep_time += 6 logger.info(f"Starting secondary collection processes with concurrency={secondary_worker_count}") - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_worker_count} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + secondary_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_worker_count} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 logger.info(f"Starting facade collection processes with concurrency={facade_worker_count}") - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_worker_count} -n facade:{uuid.uuid4().hex}@%h -Q facade" + facade_worker = f"celery -A collectoss.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_worker_count} -n facade:{uuid.uuid4().hex}@%h -Q facade" process_list.append(subprocess.Popen(facade_worker.split(" "))) sleep_time += 6 @@ -175,21 +175,21 @@ def 
start_celery_collection_processes(worker_counts: tuple[int, int, int]): @click.pass_context def stop(ctx): """ - Sends SIGTERM to all Augur server & worker processes + Sends SIGTERM to all CollectOSS server & worker processes """ - cli_logger = logging.getLogger("augur.cli") + cli_logger = logging.getLogger("collectoss.cli") - augur_stop(signal.SIGTERM, cli_logger, ctx.obj.engine) + stop_processes(signal.SIGTERM, cli_logger, ctx.obj.engine) @cli.command('kill') @with_database @click.pass_context def kill(ctx): """ - Sends SIGKILL to all Augur server & worker processes + Sends SIGKILL to all CollectOSS server & worker processes """ - cli_logger = logging.getLogger("augur.cli") - augur_stop(signal.SIGKILL, cli_logger, ctx.obj.engine) + cli_logger = logging.getLogger("collectoss.cli") + stop_processes(signal.SIGKILL, cli_logger, ctx.obj.engine) @cli.command('repo-reset') @test_connection @@ -219,21 +219,20 @@ def repo_reset(ctx): @cli.command('processes') def processes(): """ - Outputs the name/PID of all Augur server & worker processes""" - augur_processes = get_augur_collection_processes() - for process in augur_processes: + Outputs the name/PID of all CollectOSS server & worker processes""" + for process in get_collection_processes(): logger.info(f"Found process {process.pid}") -def get_augur_collection_processes(): - augur_processes = [] +def get_collection_processes(): + process_list = [] for process in psutil.process_iter(['cmdline', 'name', 'environ']): if process.info['cmdline'] is not None and process.info['environ'] is not None: try: if is_collection_process(process): - augur_processes.append(process) + process_list.append(process) except (KeyError, FileNotFoundError): pass - return augur_processes + return process_list def is_collection_process(process): @@ -241,9 +240,9 @@ def is_collection_process(process): if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: if process.pid != os.getpid(): - if "augurbackendcollection" in command or "celery_app.celery_appbeat" in command: + if "collectossbackendcollection" in command or "celery_app.celery_appbeat" in command: return True - if "augur.tasks.init.celery_app.celery_app" in command: + if "collectoss.tasks.init.celery_app.celery_app" in command: if ("scheduling" in command or "facade" in command or @@ -255,15 +254,13 @@ def is_collection_process(process): return False -def augur_stop(stop_signal, logger_instance, engine): +def stop_processes(stop_signal, logger_instance, engine): """ - Stops augur with the given signal, + Stops collectoss with the given signal, and cleans up collection if it was running """ - augur_collection_processes = get_augur_collection_processes() - - _broadcast_signal_to_processes(augur_collection_processes, logger=logger_instance, broadcast_signal=stop_signal) + _broadcast_signal_to_processes(get_collection_processes(), logger=logger_instance, broadcast_signal=stop_signal) cleanup_after_collection_halt(logger, engine) diff --git a/augur/application/cli/config.py b/collectoss/application/cli/config.py similarity index 89% rename from augur/application/cli/config.py rename to collectoss/application/cli/config.py index 6f22ea6c8..2a9a09320 100644 --- a/augur/application/cli/config.py +++ b/collectoss/application/cli/config.py @@ -1,18 +1,18 @@ #SPDX-License-Identifier: MIT """ -Augur library script for generating a config file +CollectOSS library script for generating a config file """ import os import click import json import logging -from augur.application.db.models 
import Config -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig, redact_setting_value -from augur.application.cli import DatabaseContext, test_connection, test_db_connection, with_database -from augur.util.inspect_without_import import get_phase_names_without_import -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +from collectoss.application.db.models import Config +from collectoss.application.db.session import DatabaseSession +from collectoss.application.config import SystemConfig, redact_setting_value +from collectoss.application.cli import DatabaseContext, test_connection, test_db_connection, with_database +from collectoss.util.inspect_without_import import get_phase_names_without_import +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) logger = logging.getLogger(__name__) @@ -66,7 +66,7 @@ def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redi with DatabaseSession(logger, engine=ctx.obj.engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) augmented_config = config.base_config @@ -100,7 +100,7 @@ def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redi augmented_config["Facade"]["repo_directory"] = facade_repo_directory - augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_AUGUR_DIRECTORY + "/logs/") + augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_PROJECT_REPO_DIRECTORY + "/logs/") config.load_config_from_dict(augmented_config) @@ -114,7 +114,7 @@ def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redi def load_config(ctx, file): with DatabaseSession(logger, engine=ctx.obj.engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) print("WARNING: This will override your current config") response = str(input("Would you like to continue: [y/N]: ")).lower() @@ -139,7 +139,7 @@ def load_config(ctx, file): def add_section(ctx, section_name, file): with DatabaseSession(logger, engine=ctx.obj.engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) if config.is_section_in_config(section_name): @@ -169,7 +169,7 @@ def add_section(ctx, section_name, file): def config_set(ctx, section, setting, value): with DatabaseSession(logger, engine=ctx.obj.engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) config.add_value(section, setting, value) print(f"{setting} in {section} section set to {redact_setting_value(section, setting, value)}") @@ -184,7 +184,7 @@ def config_set(ctx, section, setting, value): def config_get(ctx, section, setting): with DatabaseSession(logger, engine=ctx.obj.engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) if setting: config_value = config.get_value(section_name=section, setting_name=setting) @@ -232,7 +232,7 @@ def config_get_all_json(): def clear_config(ctx): with DatabaseSession(logger, ctx.obj.engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) if not config.empty(): diff --git a/augur/application/cli/db.py b/collectoss/application/cli/db.py similarity index 91% rename from augur/application/cli/db.py rename to collectoss/application/cli/db.py index 8d5408eae..fd5db52cf 
100644 --- a/augur/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -3,7 +3,7 @@ from os import environ, chmod, path, getenv import logging from sys import exit -from subprocess import call +from subprocess import check_call import random import string import click @@ -13,18 +13,18 @@ import re import stat as stat_module -from augur.application.cli import ( +from collectoss.application.cli import ( test_connection, test_db_connection, with_database, DatabaseContext, ) -from augur.application.db.session import DatabaseSession +from collectoss.application.db.session import DatabaseSession from sqlalchemy import update from datetime import datetime -from augur.application.db.models import Repo -from augur.application.cli.csv_utils import ( +from collectoss.application.db.models import Repo +from collectoss.application.cli._csv_utils import ( process_repo_csv, process_repo_group_csv, ) @@ -45,14 +45,14 @@ def cli(ctx): @with_database @click.pass_context def add_repos(ctx: click.Context, filename: str) -> None: - """Add repositories to Augur's database from a CSV file. + """Add repositories to CollectOSS's database from a CSV file. The CSV file can have headers (recommended): repo_url,repo_group_id - https://github.com/chaoss/augur.git,10 + https://github.com/chaoss/collectoss.git,10 Or no headers (backward compatible - column order will be auto-detected): - https://github.com/chaoss/augur.git,10 + https://github.com/chaoss/collectoss.git,10 NOTE: The Group ID must already exist in the REPO_Groups Table. @@ -66,10 +66,10 @@ def add_repos(ctx: click.Context, filename: str) -> None: Note: If you want to add an entire GitHub organization, refer to the - command: augur db add-github-org + command: collectoss db add-github-org """ - from augur.tasks.github.util.github_task_session import GithubTaskSession - from augur.util.repo_load_controller import RepoLoadController + from collectoss.tasks.github.util.github_task_session import GithubTaskSession + from collectoss.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: controller = RepoLoadController(session) @@ -156,7 +156,7 @@ def get_repo_groups(ctx: click.Context) -> pd.DataFrame: @with_database @click.pass_context def add_repo_groups(ctx: click.Context, filename: str) -> None: - """Create new repo groups in Augur's database from a CSV file. + """Create new repo groups in CollectOSS's database from a CSV file. 
Args: ctx: Click context object containing the database engine @@ -247,10 +247,10 @@ @click.pass_context def add_github_org(ctx, organization_name): """ - Create new repo groups in Augur's database + Create new repo groups in CollectOSS's database """ - from augur.tasks.github.util.github_task_session import GithubTaskSession - from augur.util.repo_load_controller import RepoLoadController + from collectoss.tasks.github.util.github_task_session import GithubTaskSession + from collectoss.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: controller = RepoLoadController(session) @@ -280,7 +280,7 @@ def print_db_version(): """ Get the version of the configured database """ - call(["alembic", "current"]) + check_call(["alembic", "current"]) @cli.command("upgrade-db-version") @@ -290,7 +290,7 @@ def upgrade_db_version(): """ Upgrade the configured database to the latest version """ - call(["alembic", "upgrade", "head"]) + check_call(["alembic", "upgrade", "head"]) @cli.command("check-for-upgrade") @@ -298,9 +298,9 @@ def upgrade_db_version(): """ - Upgrade the configured database to the latest version + Show available database migration history """ - call(["alembic", "history", "-i"]) + check_call(["alembic", "history", "-i"]) @cli.command("create-schema") @@ -310,7 +310,7 @@ def create_schema(): """ Create schema in the configured database """ - call(["alembic", "upgrade", "head"]) + check_call(["alembic", "upgrade", "head"]) def generate_key(length): @@ -323,7 +323,7 @@ def generate_key(length): @click.pass_context def generate_api_key(ctx): """ - Generate and set a new Augur API key + Generate and set a new CollectOSS API key """ key = generate_key(32) ctx.invoke(update_api_key, api_key=key) @@ -352,7 +352,7 @@ def update_api_key(ctx, api_key): with ctx.obj.engine.begin() as connection: connection.execute(update_api_key_sql, api_key=api_key) - logger.info(f"Updated Augur API key to: {api_key}") + logger.info(f"Updated CollectOSS API key to: {api_key}") @cli.command("get-api-key") @@ -371,22 +371,22 @@ def get_api_key(ctx): with ctx.obj.engine.connect() as connection: print(connection.execute(get_api_key_sql).fetchone()[0]) except TypeError: - print("No Augur API key found.") + print("No CollectOSS API key found.") @cli.command( "check-pgpass", - short_help="Check the ~/.pgpass file for Augur's database credentials", + short_help="Check the ~/.pgpass file for CollectOSS's database credentials", ) def check_pgpass(): - augur_db_env_var = getenv("AUGUR_DB") - if augur_db_env_var: + db_environment_var = getenv("AUGUR_DB") + if db_environment_var: # gets the user, password, host, port, and database_name out of environment variable # assumes database string of structure <protocol>//<user>:<password>@<host>:<port>/<database_name> # it returns a tuple like (<user>, <password>, <host>, <port>, <database_name>) diff --git a/augur/application/cli/user.py b/collectoss/application/cli/user.py --- a/augur/application/cli/user.py +++ b/collectoss/application/cli/user.py ''' -Add Regular user command: augur user add -Add Admin command: augur user add --admin +Add Regular user command: collectoss user add +Add Admin command: collectoss user add --admin ''' import os import click import logging -from augur.application.db.models import User -from augur.application.db.engine import DatabaseEngine +from collectoss.application.db.models import User +from collectoss.application.db.engine import DatabaseEngine from sqlalchemy.orm import sessionmaker # TODO: Update these commands to use cli DatabaseContext so this engine is cleaned up diff --git a/augur/application/config.py b/collectoss/application/config.py similarity index 96% rename
from augur/application/config.py rename to collectoss/application/config.py index f46f6dc27..56e6c57ae 100644 --- a/augur/application/config.py +++ b/collectoss/application/config.py @@ -4,22 +4,22 @@ import copy from typing import List, Any, Optional import os -from augur.application.db.models import Config -from augur.application.db.util import execute_session_query, convert_type_of_value +from collectoss.application.db.models import Config +from collectoss.application.db.util import execute_session_query, convert_type_of_value from pathlib import Path import logging def get_development_flag_from_config(): from logging import getLogger - from augur.application.db.session import DatabaseSession + from collectoss.application.db.session import DatabaseSession logger = getLogger(__name__) with DatabaseSession(logger) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) - section = "Augur" + section = "CollectOSS" setting = "developer" flag = config.get_value(section, setting) @@ -34,7 +34,7 @@ def redact_setting_value(section_name, setting_name, value): return value_redacted default_config = { - "Augur": { + "CollectOSS": { "developer": 0, "version": 1 }, @@ -55,7 +55,6 @@ def redact_setting_value(section_name, setting_name, value): "rebuild_caches": 1, "run_analysis": 1, "run_facade_contributors": 1, - "facade_contributor_full_recollect": 0, "commit_messages": 1, }, "Server": { @@ -83,14 +82,17 @@ def redact_setting_value(section_name, setting_name, value): "connection_string": "redis://127.0.0.1:6379/" }, "RabbitMQ": { - "connection_string": "amqp://augur:password123@localhost:5672/augur_vhost" + "connection_string": "amqp://augur:password123@localhost:5672/collectoss_vhost" }, "Tasks": { "collection_interval": 30, "core_collection_interval_days": 15, "secondary_collection_interval_days": 10, "facade_collection_interval_days": 10, - "ml_collection_interval_days": 40 + "ml_collection_interval_days": 40, + "default_batch_size": 1000, + "github_event_batch_size": 500, + "github_message_batch_size": 20 }, "Message_Insights": { "insight_days": 30, @@ -124,16 +126,16 @@ def redact_setting_value(section_name, setting_name, value): } -class AugurConfig(): +class SystemConfig(): - from augur.application.db.session import DatabaseSession + from collectoss.application.db.session import DatabaseSession session: DatabaseSession @property def base_config(self): """Return the "base" config - either the default config or a default config with user modifications on top - This is used as a base upon which the Augur CLI injects values, such as API keys, connection strings, + This is used as a base upon which the CLI injects values, such as API keys, connection strings, and other values passed in via environment variables. This config is then modified and passed into `load_config_from_dict`. 
""" @@ -145,7 +147,7 @@ def base_config(self): return config def __init__(self, logger, session: DatabaseSession, config_sources: list = None): - """Create a new AugurConfig class + """Create a new SystemConfig class Args: logger (_type_): The logger instance to use for logging @@ -383,7 +385,7 @@ def remove_section(self, section_name: str) -> None: return class NotWriteableException(Exception): - """Custom Augur exception class to be used when trying to modify a config that is not writeable + """Custom exception class to be used when trying to modify a config that is not writeable """ pass @@ -646,7 +648,7 @@ def __repr__(self): class DatabaseConfig(ConfigStore): """A ConfigStore for handling JSON data """ - from augur.application.db.session import DatabaseSession + from collectoss.application.db.session import DatabaseSession def __init__(self, session: DatabaseSession, logger: logging.Logger): super().__init__(logger) @@ -663,7 +665,7 @@ def empty(self): @staticmethod def _dict_to_config_table(json_data:dict): - """Convert an augur settings dict into a mapping from table columns to values for insertion in bulk + """Convert an collectoss settings dict into a mapping from table columns to values for insertion in bulk Args: json_data (dict): The settings to convert, in the same format as the default_dict at the top of this file diff --git a/augur/application/db/__init__.py b/collectoss/application/db/__init__.py similarity index 92% rename from augur/application/db/__init__.py rename to collectoss/application/db/__init__.py index f460bee36..b10b00b44 100644 --- a/augur/application/db/__init__.py +++ b/collectoss/application/db/__init__.py @@ -2,7 +2,7 @@ from sqlalchemy.orm import sessionmaker from contextlib import contextmanager -from augur.application.db.engine import create_database_engine, get_database_string +from collectoss.application.db.engine import create_database_engine, get_database_string engine = None Session = None diff --git a/augur/application/db/data_parse.py b/collectoss/application/db/data_parse.py similarity index 86% rename from augur/application/db/data_parse.py rename to collectoss/application/db/data_parse.py index eaa99fd39..d282fe423 100644 --- a/augur/application/db/data_parse.py +++ b/collectoss/application/db/data_parse.py @@ -2,7 +2,7 @@ This file contains functions that take the api response and return only the data that the database needs """ -from augur.tasks.util.AugurUUID import GithubUUID, GitlabUUID +from collectoss.tasks.util.ContributorUUID import GithubUUID, GitlabUUID import sqlalchemy as s from typing import List @@ -44,9 +44,9 @@ def extract_needed_mr_label_data(labels: List[dict], repo_id: int, tool_source: Arguments: labels: List of dictionaries of label data - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -85,9 +85,9 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so Arguments: assignees: List of dictionaries of asignee data - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectossctoss that 
processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -121,9 +121,9 @@ def extract_needed_merge_request_assignee_data(assignees: List[dict], repo_id: i Arguments: assignees: List of dictionaries of asignee data - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -158,9 +158,9 @@ def extract_needed_pr_reviewer_data(reviewers: List[dict], repo_id: int, tool_so Arguments: reviewers: List of dictionaries of reviewer data - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -214,12 +214,12 @@ def extract_needed_pr_metadata(metadata_list: List[dict], repo_id: int, tool_sou return metadata_dicts -def extract_pr_review_message_ref_data(comment: dict, augur_pr_review_id, github_pr_review_id, repo_id: int, tool_version: str, data_source: str) -> dict: +def extract_pr_review_message_ref_data(comment: dict, pr_review_id, github_pr_review_id, repo_id: int, tool_version: str, data_source: str) -> dict: pr_review_comment_message_ref = { # msg_id turned up null when I removed the cast to int .. 'msg_id': comment["msg_id"], - 'pr_review_id': augur_pr_review_id, + 'pr_review_id': pr_review_id, 'pr_review_msg_url': comment['url'], 'pr_review_src_id': int(github_pr_review_id), 'pr_review_msg_src_id': int(comment['id']), @@ -357,9 +357,9 @@ def extract_needed_gitlab_issue_assignee_data(assignees: List[dict], repo_id: in Arguments: assignees: List of dictionaries of gitlab assignee data - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -423,9 +423,9 @@ def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, too Arguments: labels: List of dictionaries of gitlab issue label data - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -464,9 +464,9 @@ def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: Arguments: message: Message data dict issue_id: id of the issue - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The
version of the collectoss task that processed the data data_source: The source of the data @@ -509,9 +509,9 @@ def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): Arguments: pr: PR data dict - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data Returns: @@ -580,9 +580,9 @@ def extract_needed_issue_data(issue: dict, repo_id: int, tool_source: str, tool_ Arguments: issue: Issue data dict - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: platform source @@ -649,38 +649,45 @@ def extract_needed_contributor_data(contributor, tool_source, tool_version, data if not contributor: return None - cntrb_id = GithubUUID() - cntrb_id["user"] = contributor["id"] + cntrb_id = GithubUUID() + cntrb_id["user"] = int(contributor["id"]) + + # Extract all available profile fields using .get() so we never miss or crash on + # optional keys. Email is only present when the user has made it public or when + # the request is appropriately authenticated (see GitHub API docs). + email = contributor.get('email') + + # TODO get and store an owner id contributor = { "cntrb_id": cntrb_id.to_UUID(), "cntrb_login": contributor['login'], - "cntrb_created_at": contributor['created_at'] if 'created_at' in contributor else None, - "cntrb_email": contributor['email'] if 'email' in contributor else None, - "cntrb_company": contributor['company'] if 'company' in contributor else None, - "cntrb_location": contributor['location'] if 'location' in contributor else None, + "cntrb_created_at": contributor.get('created_at'), + "cntrb_email": email, + "cntrb_company": contributor.get('company'), + "cntrb_location": contributor.get('location'), # "cntrb_type": , dont have a use for this as of now ... 
let it default to null - "cntrb_canonical": contributor['email'] if 'email' in contributor else None, + "cntrb_canonical": email, + "cntrb_full_name": contributor.get('name'), "gh_user_id": contributor['id'], "gh_login": str(contributor['login']), ## cast as string by SPG on 11/28/2021 due to `nan` user - "gh_url": contributor['url'], - "gh_html_url": contributor['html_url'], - "gh_node_id": contributor['node_id'], - "gh_avatar_url": contributor['avatar_url'], - "gh_gravatar_id": contributor['gravatar_id'], - "gh_followers_url": contributor['followers_url'], - "gh_following_url": contributor['following_url'], - "gh_gists_url": contributor['gists_url'], - "gh_starred_url": contributor['starred_url'], - "gh_subscriptions_url": contributor['subscriptions_url'], - "gh_organizations_url": contributor['organizations_url'], - "gh_repos_url": contributor['repos_url'], - "gh_events_url": contributor['events_url'], - "gh_received_events_url": contributor['received_events_url'], - "gh_type": contributor['type'], - "gh_site_admin": contributor['site_admin'], - "cntrb_last_used" : None if 'updated_at' not in contributor else contributor['updated_at'], - "cntrb_full_name" : None if 'name' not in contributor else contributor['name'], + "gh_url": contributor.get('url'), + "gh_html_url": contributor.get('html_url'), + "gh_node_id": contributor.get('node_id'), + "gh_avatar_url": contributor.get('avatar_url'), + "gh_gravatar_id": contributor.get('gravatar_id'), + "gh_followers_url": contributor.get('followers_url'), + "gh_following_url": contributor.get('following_url'), + "gh_gists_url": contributor.get('gists_url'), + "gh_starred_url": contributor.get('starred_url'), + "gh_subscriptions_url": contributor.get('subscriptions_url'), + "gh_organizations_url": contributor.get('organizations_url'), + "gh_repos_url": contributor.get('repos_url'), + "gh_events_url": contributor.get('events_url'), + "gh_received_events_url": contributor.get('received_events_url'), + "gh_type": contributor.get('type'), + "gh_site_admin": contributor.get('site_admin'), + "cntrb_last_used" : contributor.get('updated_at'), "tool_source": tool_source, "tool_version": tool_version, "data_source": data_source @@ -783,9 +790,9 @@ def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, t Arguments: pr: PR data dict - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data Returns: @@ -847,9 +854,9 @@ def extract_needed_issue_data_from_gitlab_issue(issue: dict, repo_id: int, tool_ Arguments: issue: Issue data dict - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: platform source @@ -896,9 +903,9 @@ def extract_gitlab_mr_event_data(event: dict, pr_id: int, platform_id: int, repo event: Event data dict pr_id: id of the pr platform_id: id of the platform - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: 
collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: platform source @@ -932,9 +939,9 @@ def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int event: Event data dict issue_id: id of the issue platform_id: id of the platform - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: platform source @@ -968,8 +975,8 @@ def extract_needed_mr_reviewer_data(data: List[dict], pull_request_id, tool_sour Arguments: data: List of dictionaries that contain mr reviewer data to parse pull_request_id: id of the PR - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -1004,10 +1011,10 @@ def extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, Arguments: commit: commit data dictionary - repo_id: augur id of the repository + repo_id: collectoss id for the repository pull_request_id: id of the PR - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -1035,10 +1042,10 @@ def extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool Arguments: gitlab_file_data: file data dictionary - repo_id: augur id of the repository + repo_id: collectoss id for the repository pull_request_id: id of the PR - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -1078,10 +1085,10 @@ def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, t Arguments: mr_dict: mr data dictionary - repo_id: augur id of the repository + repo_id: collectoss id for the repository pull_request_id: id of the PR - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -1136,9 +1143,9 @@ def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, r Arguments: message: message data dict issue_id: id of the issue - repo_id: augur id of the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -1166,9 +1173,9 @@ def 
extract_needed_gitlab_message_data(comment: dict, platform_id: int, repo_id: Arguments: comment: comment data dict - platform_id: augur id of the relevant platform - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + platform_id: collectoss id of the relevant platform + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data @@ -1197,10 +1204,10 @@ def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: in Arguments: comment: comment data dict pull_request_id: id of the PR - repo_id: augur id of the repository - platform_id: augur id of the relevant platform - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + platform_id: collectoss id of the relevant platform + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data diff --git a/augur/application/db/engine.py b/collectoss/application/db/engine.py similarity index 93% rename from augur/application/db/engine.py rename to collectoss/application/db/engine.py index 0ea2bc173..ef582dbed 100644 --- a/augur/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -7,7 +7,7 @@ from sqlalchemy import create_engine, event from sqlalchemy.engine import Engine -from augur.application.db.util import catch_operational_error +from collectoss.application.db.util import catch_operational_error def parse_database_string(db_string: str) -> tuple[str,str, str, str, str]: @@ -61,24 +61,24 @@ def get_database_string() -> str: postgres database string """ - augur_db_environment_var = os.getenv("AUGUR_DB") + db_environment_var = os.getenv("AUGUR_DB") try: current_dir = os.getcwd() except FileNotFoundError: - print("\n\nPlease run augur commands in the root directory\n\n") + print("\n\nPlease run collectoss commands in the root directory\n\n") sys.exit() db_json_file_location = current_dir + "/db.config.json" db_json_exists = os.path.exists(db_json_file_location) - if not augur_db_environment_var and not db_json_exists: + if not db_environment_var and not db_json_exists: print("ERROR no way to get connection to the database. 
\n\t\t\t\t\t\t There is no db.config.json and the AUGUR_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the AUGUR_DB environment then run make install") sys.exit() - if augur_db_environment_var: - return augur_db_environment_var + if db_environment_var: + return db_environment_var with open("db.config.json", 'r') as f: diff --git a/augur/application/db/lib.py b/collectoss/application/db/lib.py similarity index 93% rename from augur/application/db/lib.py rename to collectoss/application/db/lib.py index 4f106b0a7..4d719d1ac 100644 --- a/augur/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -10,11 +10,11 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup -from augur.tasks.util.collection_state import CollectionState -from augur.application.db import get_session, get_engine -from augur.application.db.util import execute_session_query, convert_type_of_value -from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts +from collectoss.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup +from collectoss.tasks.util.collection_state import CollectionState +from collectoss.application.db import get_session, get_engine +from collectoss.application.db.util import execute_session_query, convert_type_of_value +from collectoss.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") @@ -48,8 +48,29 @@ def get_value(section_name: str, setting_name: str) -> Optional[Any]: setting_dict = convert_type_of_value(setting_dict, logger) return setting_dict["value"] - - + + +def get_batch_size(task_type: str = None) -> int: + """Get batch size for a task, with fallback to default. + + Args: + task_type: Optional task type (e.g., "event", "message"). + If provided and a specific config exists for it, + that value is used. Otherwise falls back to default_batch_size. 
+ + Returns: + Batch size integer (default: 1000) + """ + if task_type: + specific_key = f"github_{task_type}_batch_size" + value = get_value("Tasks", specific_key) + if value is not None: + return int(value) + + default_value = get_value("Tasks", "default_batch_size") + return int(default_value) if default_value is not None else 1000 + + def execute_sql(sql_text): engine = get_engine() @@ -496,19 +517,13 @@ def get_contributor_aliases_by_email(email): with get_session() as session: return session.query(ContributorsAlias).filter_by(alias_email=email).all() - -def get_unresolved_commit_emails_by_name(name): - with get_session() as session: - - return session.query(UnresolvedCommitEmail).filter_by(name=name).all() - -def get_contributors_by_full_name(full_name): +def get_unresolved_commit_emails_by_email(email): with get_session() as session: - return session.query(Contributor).filter_by(cntrb_full_name=full_name).all() - + return session.query(UnresolvedCommitEmail).filter_by(email=email).all() + def get_contributors_by_github_user_id(id): with get_session() as session: diff --git a/augur/application/db/models/__init__.py b/collectoss/application/db/models/__init__.py similarity index 91% rename from augur/application/db/models/__init__.py rename to collectoss/application/db/models/__init__.py index 06ca9cb91..bed0e4c8e 100644 --- a/augur/application/db/models/__init__.py +++ b/collectoss/application/db/models/__init__.py @@ -1,4 +1,4 @@ -from augur.application.db.models.augur_data import ( +from collectoss.application.db.models.augur_data import ( ChaossMetricStatus, ChaossUser, ContributorAffiliation, @@ -69,9 +69,9 @@ RepoClone, ) -from augur.application.db.models.spdx import ( +from collectoss.application.db.models.spdx import ( SpdxAnnotationType, - SpdxAugurRepoMap, + SpdxRepoMap, SpdxCreatorType, SpdxDocumentNamespace, SpdxFileType, @@ -95,8 +95,8 @@ SpdxIdentifier, ) -from augur.application.db.models.augur_operations import ( - AugurSetting, +from collectoss.application.db.models.augur_operations import ( + Settings, WorkerHistory, WorkerJob, WorkerOauth, diff --git a/augur/application/db/models/augur_data.py b/collectoss/application/db/models/augur_data.py similarity index 89% rename from augur/application/db/models/augur_data.py rename to collectoss/application/db/models/augur_data.py index 78c2ce715..7ea85eefc 100644 --- a/augur/application/db/models/augur_data.py +++ b/collectoss/application/db/models/augur_data.py @@ -18,6 +18,7 @@ Text, UniqueConstraint, text, + Sequence, func ) from sqlalchemy.dialects.postgresql import JSONB, TIMESTAMP, UUID @@ -31,9 +32,9 @@ import urllib.parse -from augur.application.db.models.base import Base -from augur.application.db.util import execute_session_query -from augur.application.db import get_session +from collectoss.application.db.models.base import Base +from collectoss.application.db.util import execute_session_query +from collectoss.application.db import get_session DEFAULT_REPO_GROUP_ID = 1 @@ -63,11 +64,12 @@ class ChaossMetricStatus(Base): __tablename__ = "chaoss_metric_status" __table_args__ = { "schema": "augur_data", - "comment": "This table used to track CHAOSS Metric implementations in Augur, but due to the constantly changing location of that information, it is for the moment not actively populated. ", + "comment": "This table used to track CHAOSS Metric implementations, but due to the constantly changing location of that information, it is for the moment not actively populated. 
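The new `get_batch_size` helper above resolves a task-specific key before falling back to `default_batch_size`. A database-free sketch of the same resolution order, using the "Tasks" defaults added earlier in this patch, with a plain dict standing in for `get_value("Tasks", ...)` (the function name `resolve_batch_size` is illustrative only):

```python
from typing import Optional

# Stand-in for the "Tasks" section defaults added in this patch.
TASKS = {
    "default_batch_size": 1000,
    "github_event_batch_size": 500,
    "github_message_batch_size": 20,
}

def resolve_batch_size(task_type: Optional[str] = None) -> int:
    """Mirror get_batch_size(): try the specific key, then the default."""
    if task_type:
        specific = TASKS.get(f"github_{task_type}_batch_size")
        if specific is not None:
            return int(specific)
    return int(TASKS.get("default_batch_size", 1000))

assert resolve_batch_size("event") == 500
assert resolve_batch_size("message") == 20
assert resolve_batch_size("commit") == 1000  # no specific key -> default
assert resolve_batch_size() == 1000
```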
", } cms_id = Column( BigInteger, + Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.chaoss_metric_status_cms_id_seq'::regclass)" @@ -99,6 +101,7 @@ class ChaossUser(Base): chaoss_id = Column( BigInteger, + Sequence('chaoss_user_chaoss_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.chaoss_user_chaoss_id_seq'::regclass)" @@ -120,11 +123,12 @@ class ContributorAffiliation(Base): __tablename__ = "contributor_affiliations" __table_args__ = { "schema": "augur_data", - "comment": "This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and augur augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when Augur is finished counting commits using the Facade Worker. ", + "comment": "This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and collectoss augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when CollectOSS is finished counting commits using the Facade Worker. ", } ca_id = Column( BigInteger, + Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.contributor_affiliations_ca_id_seq'::regclass)" @@ -163,6 +167,7 @@ class Contributor(Base): Index("contributors_idx_cntrb_email3", "cntrb_email"), Index("cntrb_canonica-idx11", "cntrb_canonical"), Index("cntrb_login_platform_index", "cntrb_login"), + Index("gh_login", text("gh_login ASC NULLS FIRST")), # added @@ -181,9 +186,6 @@ class Contributor(Base): cntrb_id = Column( UUID(as_uuid=True), primary_key=True, - server_default=text( - "nextval('augur_data.contributors_cntrb_id_seq'::regclass)" - ), ) cntrb_login = Column( String, @@ -191,13 +193,13 @@ class Contributor(Base): ) cntrb_email = Column( String, - comment="This needs to be here for matching contributor ids, which are augur, to the commit information. ", + comment="This needs to be here for matching contributor ids to the commit information. ", ) cntrb_full_name = Column(String) cntrb_company = Column(String) cntrb_created_at = Column(TIMESTAMP(precision=0)) cntrb_type = Column( - String, comment="Present in another models. It is not currently used in Augur. " + String, comment="Present in another models. It is not currently used. 
" ) cntrb_fake = Column(SmallInteger, server_default=text("0")) cntrb_deleted = Column(SmallInteger, server_default=text("0")) @@ -271,7 +273,7 @@ class Contributor(Base): @classmethod def from_github(cls, contributor, tool_source, tool_version, data_source): - from augur.tasks.util.AugurUUID import GithubUUID + from collectoss.tasks.util.ContributorUUID import GithubUUID cntrb_id = GithubUUID() cntrb_id["user"] = contributor["id"] @@ -499,6 +501,7 @@ class LstmAnomalyModel(Base): model_id = Column( BigInteger, + Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.lstm_anomaly_models_model_id_seq'::regclass)" @@ -527,6 +530,7 @@ class Platform(Base): pltfrm_id = Column( BigInteger, + Sequence('platform_pltfrm_id_seq', start=25430, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.platform_pltfrm_id_seq'::regclass)"), ) @@ -550,6 +554,7 @@ class RepoGroup(Base): repo_group_id = Column( BigInteger, + Sequence('repo_groups_repo_group_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_groups_repo_group_id_seq'::regclass)" @@ -639,6 +644,7 @@ class TopicWord(Base): topic_words_id = Column( BigInteger, + Sequence('topic_words_topic_words_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.topic_words_topic_words_id_seq'::regclass)" @@ -682,6 +688,7 @@ class UnresolvedCommitEmail(Base): email_unresolved_id = Column( BigInteger, + Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" @@ -703,6 +710,7 @@ class UtilityLog(Base): id = Column( BigInteger, + Sequence('utility_log_id_seq1', start=1, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.utility_log_id_seq1'::regclass)"), ) @@ -730,12 +738,13 @@ class ContributorRepo(Base): UniqueConstraint("event_id", "tool_version"), { "schema": "augur_data", - "comment": 'Developed in Partnership with Andrew Brain. 
\nFrom: [\n {\n "login": "octocat",\n "id": 1,\n "node_id": "MDQ6VXNlcjE=",\n "avatar_url": "https://github.com/images/error/octocat_happy.gif",\n "gravatar_id": "",\n "url": "https://api.github.com/users/octocat",\n "html_url": "https://github.com/octocat",\n "followers_url": "https://api.github.com/users/octocat/followers",\n "following_url": "https://api.github.com/users/octocat/following{/other_user}",\n "gists_url": "https://api.github.com/users/octocat/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/octocat/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/octocat/subscriptions",\n "organizations_url": "https://api.github.com/users/octocat/orgs",\n "repos_url": "https://api.github.com/users/octocat/repos",\n "events_url": "https://api.github.com/users/octocat/events{/privacy}",\n "received_events_url": "https://api.github.com/users/octocat/received_events",\n "type": "User",\n "site_admin": false\n }\n]\n', + "comment": 'Developed in Partnership with Andrew Brain.', }, ) cntrb_repo_id = Column( BigInteger, + Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.contributor_repo_cntrb_repo_id_seq'::regclass)" @@ -771,15 +780,16 @@ class ContributorRepo(Base): class ContributorsAlias(Base): __tablename__ = "contributors_aliases" __table_args__ = ( - UniqueConstraint("alias_email"), + UniqueConstraint("cntrb_id","alias_email", name="cntrb-email-insert-unique"), { "schema": "augur_data", - "comment": "Every open source user may have more than one email used to make contributions over time. Augur selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ", + "comment": "Every open source user may have more than one email used to make contributions over time. CollectOSS selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. 
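The `contributors_aliases` comment above implies a two-step lookup: any email resolves to a canonical email through the alias table (the canonical email is aliased to itself), and the canonical email then joins to the richer `contributors` row. A toy sketch of that resolution; the dicts and addresses are illustrative only, standing in for the two tables:

```python
# alias_email -> canonical_email; note the canonical self-row.
aliases = {
    "dev@work.example": "dev@canonical.example",
    "dev@canonical.example": "dev@canonical.example",
}
# canonical_email -> detailed contributor record.
contributors = {"dev@canonical.example": {"cntrb_full_name": "Dev Example"}}

def lookup_contributor(email: str):
    """Resolve any known address to its full contributor record."""
    canonical = aliases.get(email)
    return contributors.get(canonical) if canonical else None

assert lookup_contributor("dev@work.example")["cntrb_full_name"] == "Dev Example"
```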
", }, ) cntrb_alias_id = Column( BigInteger, + Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.contributors_aliases_cntrb_alias_id_seq'::regclass)" @@ -815,6 +825,7 @@ class Repo(Base): __tablename__ = "repo" __table_args__ = ( UniqueConstraint("repo_git", name="repo_git-unique"), + UniqueConstraint("repo_src_id", name="repo_src_id_unique"), Index("forked", "forked_from"), Index("repo_idx_repo_id_repo_namex", "repo_id", "repo_name"), @@ -834,6 +845,7 @@ class Repo(Base): repo_id = Column( BigInteger, + Sequence('repo_repo_id_seq', start=25480, schema='augur_data'), primary_key=True, server_default=text("nextval('augur_data.repo_repo_id_seq'::regclass)"), ) @@ -909,7 +921,7 @@ def is_valid_github_repo(gh_session, url: str) -> bool: Returns True if repo url is valid and False if not """ - from augur.tasks.github.util.github_paginator import hit_api + from collectoss.tasks.github.util.github_paginator import hit_api REPO_ENDPOINT = "https://api.github.com/repos/{}/{}" @@ -969,7 +981,7 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: Returns: True if repo URL is valid, False otherwise """ - from augur.tasks.github.util.github_paginator import hit_api + from collectoss.tasks.github.util.github_paginator import hit_api REPO_ENDPOINT = "https://gitlab.com/api/v4/projects/{}/" @@ -1192,6 +1204,7 @@ class RepoTestCoverage(Base): repo_id = Column( ForeignKey("augur_data.repo.repo_id"), + Sequence('repo_test_coverage_repo_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_test_coverage_repo_id_seq'::regclass)" @@ -1219,11 +1232,12 @@ class RepoGroupInsight(Base): __tablename__ = "repo_group_insights" __table_args__ = { "schema": "augur_data", - "comment": 'This table is output from an analytical worker inside of Augur. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + "comment": 'This table is output from an analytical worker. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. 
', } rgi_id = Column( BigInteger, + Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_group_insights_rgi_id_seq'::regclass)" @@ -1257,6 +1271,7 @@ class RepoGroupsListServe(Base): rgls_id = Column( BigInteger, + Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_groups_list_serve_rgls_id_seq'::regclass)" @@ -1311,6 +1326,7 @@ class Commit(Base): cmt_id = Column( BigInteger, + Sequence('commits_cmt_id_seq', start=25430, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), ) @@ -1338,7 +1354,14 @@ class Commit(Base): cmt_whitespace = Column(Integer, nullable=False) cmt_filename = Column(String, nullable=False) cmt_date_attempted = Column(TIMESTAMP(precision=0), nullable=False) - cmt_ght_author_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + cmt_ght_author_id = Column(ForeignKey( + "augur_data.contributors.cntrb_id", + name="cmt_ght_author_cntrb_id_fk", + onupdate="CASCADE", + ondelete="RESTRICT", + initially="DEFERRED", + deferrable=True + )) cmt_ght_committer_id = Column(Integer) cmt_ght_committed_at = Column(TIMESTAMP(precision=0)) cmt_committer_timestamp = Column(TIMESTAMP(True, 0)) @@ -1388,6 +1411,7 @@ class CommitMessage(Base): cmt_msg_id = Column( BigInteger, + Sequence('commits_cmt_id_seq', start=25430, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), ) @@ -1423,6 +1447,7 @@ class Issue(Base): issue_id = Column( BigInteger, + Sequence('issue_seq', start=31000, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.issue_seq'::regclass)"), ) @@ -1488,6 +1513,7 @@ class Library(Base): library_id = Column( BigInteger, + Sequence('libraries_library_id_seq', start=25430, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.libraries_library_id_seq'::regclass)"), ) @@ -1529,6 +1555,7 @@ class LstmAnomalyResult(Base): result_id = Column( BigInteger, + Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.lstm_anomaly_results_result_id_seq'::regclass)" @@ -1570,6 +1597,7 @@ class Message(Base): msg_id = Column( BigInteger, + Sequence('message_msg_id_seq', start=25430, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.message_msg_id_seq'::regclass)"), ) @@ -1639,6 +1667,7 @@ class MessageAnalysisSummary(Base): msg_summary_id = Column( BigInteger, + Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.message_analysis_summary_msg_summary_id_seq'::regclass)" @@ -1678,6 +1707,7 @@ class MessageSentimentSummary(Base): msg_summary_id = Column( BigInteger, + Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.message_sentiment_summary_msg_summary_id_seq'::regclass)" @@ -1714,15 +1744,17 @@ class PullRequest(Base): UniqueConstraint("repo_id", "pr_src_id", name="unique-pr"), UniqueConstraint("repo_id", "pr_src_id", name="unique-prx"), UniqueConstraint("pr_url", name="pull-request-insert-unique"), - Index("id_node", "pr_src_id", "pr_src_node_id"), + Index("id_node", text("pr_src_id DESC"), text("pr_src_node_id DESC NULLS 
LAST")), Index( "pull_requests_idx_repo_id_data_datex", "repo_id", "data_collection_date" ), + Index("pr_ID_prs_table", "pull_request_id"), {"schema": "augur_data"}, ) pull_request_id = Column( BigInteger, + Sequence('pull_requests_pull_request_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_requests_pull_request_id_seq'::regclass)" @@ -1742,7 +1774,7 @@ class PullRequest(Base): pr_patch_url = Column(String) pr_issue_url = Column(String) pr_augur_issue_id = Column( - BigInteger, comment="This is to link to the augur stored related issue" + BigInteger, comment="This is to link to the internal ID for the related issue" ) pr_src_number = Column( BigInteger, comment="The pr_src_number is unique within a repository." @@ -1754,7 +1786,7 @@ class PullRequest(Base): ForeignKey( "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), - comment="This is to link to the augur contributor record. ", + comment="This is to link to the contributor record. ", ) pr_body = Column(Text) pr_created_at = Column(TIMESTAMP(precision=0)) @@ -1854,7 +1886,8 @@ class Release(Base): __table_args__ = {"schema": "augur_data"} release_id = Column( - CHAR(128), + CHAR(256), + Sequence('releases_release_id_seq', start=1, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.releases_release_id_seq'::regclass)"), ) @@ -1889,6 +1922,7 @@ class RepoBadging(Base): badge_collection_id = Column( BigInteger, + Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_badging_badge_collection_id_seq'::regclass)" @@ -1930,6 +1964,7 @@ class RepoClusterMessage(Base): msg_cluster_id = Column( BigInteger, + Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" @@ -1960,6 +1995,7 @@ class RepoDependency(Base): repo_dependencies_id = Column( BigInteger, + Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_dependencies_repo_dependencies_id_seq'::regclass)" @@ -1990,6 +2026,7 @@ class RepoDepsLibyear(Base): repo_deps_libyear_id = Column( BigInteger, + Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" @@ -2018,12 +2055,13 @@ class RepoDepsLibyear(Base): class RepoDepsScorecard(Base): __tablename__ = "repo_deps_scorecard" __table_args__ = ( - UniqueConstraint("repo_id","name", name="deps-scorecard-insert-unique"), + UniqueConstraint("repo_id","name", "data_collection_date", name="deps_scorecard_new_unique"), {"schema": "augur_data"} ) repo_deps_scorecard_id = Column( BigInteger, + Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" @@ -2054,6 +2092,7 @@ class RepoInfo(Base): repo_info_id = Column( BigInteger, + Sequence('repo_info_repo_info_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_info_repo_info_id_seq'::regclass)" @@ -2105,11 +2144,12 @@ class RepoInsight(Base): __tablename__ = "repo_insights" __table_args__ = { "schema": 
"augur_data", - "comment": 'This table is output from an analytical worker inside of Augur. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + "comment": 'This table is output from an analytical worker. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', } ri_id = Column( BigInteger, + Sequence('repo_insights_ri_id_seq', start=25430, schema="augur_data"), primary_key=True, server_default=text("nextval('augur_data.repo_insights_ri_id_seq'::regclass)"), ) @@ -2143,6 +2183,7 @@ class RepoInsightsRecord(Base): ri_id = Column( BigInteger, + Sequence('repo_insights_records_ri_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_insights_records_ri_id_seq'::regclass)" @@ -2165,13 +2206,13 @@ class RepoInsightsRecord(Base): String, comment='A confidence interval or other expression of the type of threshold and the value of a threshold met in order for it to be "an insight". Example. "95% confidence interval". ', ) - tool_source = Column(String, comment="Standard Augur Metadata") - tool_version = Column(String, comment="Standard Augur Metadata") - data_source = Column(String, comment="Standard Augur Metadata") + tool_source = Column(String, comment="Standard Collection Metadata") + tool_version = Column(String, comment="Standard Collection Metadata") + data_source = Column(String, comment="Standard Collection Metadata") data_collection_date = Column( TIMESTAMP(precision=6), server_default=text("CURRENT_TIMESTAMP"), - comment="Standard Augur Metadata", + comment="Standard Collection Metadata", ) repo = relationship("Repo") @@ -2189,6 +2230,7 @@ class RepoLabor(Base): repo_labor_id = Column( BigInteger, + Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_labor_repo_labor_id_seq'::regclass)" @@ -2226,6 +2268,7 @@ class RepoMeta(Base): ) rmeta_id = Column( BigInteger, + Sequence('repo_meta_rmeta_id_seq', start=25430, schema="augur_data"), primary_key=True, nullable=False, server_default=text("nextval('augur_data.repo_meta_rmeta_id_seq'::regclass)"), @@ -2246,6 +2289,7 @@ class RepoSbomScan(Base): rsb_id = Column( BigInteger, + Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_sbom_scans_rsb_id_seq'::regclass)" @@ -2268,6 +2312,7 @@ class RepoStat(Base): ) rstat_id = Column( BigInteger, + Sequence('repo_stats_rstat_id_seq', start=25430, schema="augur_data"), primary_key=True, nullable=False, server_default=text("nextval('augur_data.repo_stats_rstat_id_seq'::regclass)"), @@ -2288,6 +2333,7 @@ class RepoTopic(Base): repo_topic_id = Column( BigInteger, + Sequence('repo_topic_repo_topic_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_topic_repo_topic_id_seq'::regclass)" @@ -2315,6 +2361,7 @@ class 
CommitCommentRef(Base): cmt_comment_id = Column( BigInteger, + Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.commit_comment_ref_cmt_comment_id_seq'::regclass)" @@ -2376,6 +2423,7 @@ class CommitParent(Base): ) parent_id = Column( ForeignKey("augur_data.commits.cmt_id"), + Sequence('commit_parents_parent_id_seq', start=25430, schema='augur_data'), primary_key=True, nullable=False, server_default=text( @@ -2404,6 +2452,7 @@ class DiscourseInsight(Base): msg_discourse_id = Column( BigInteger, + Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.discourse_insights_msg_discourse_id_seq1'::regclass)" @@ -2431,6 +2480,7 @@ class IssueAssignee(Base): issue_assignee_id = Column( BigInteger, + Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_assignees_issue_assignee_id_seq'::regclass)" @@ -2490,6 +2540,7 @@ class IssueEvent(Base): event_id = Column( BigInteger, + Sequence('issue_events_event_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_events_event_id_seq'::regclass)" @@ -2574,6 +2625,7 @@ class IssueLabel(Base): issue_label_id = Column( BigInteger, + Sequence('issue_labels_issue_label_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_labels_issue_label_id_seq'::regclass)" @@ -2630,6 +2682,7 @@ class IssueMessageRef(Base): issue_msg_ref_id = Column( BigInteger, + Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" @@ -2691,6 +2744,7 @@ class LibraryDependency(Base): lib_dependency_id = Column( BigInteger, + Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.library_dependencies_lib_dependency_id_seq'::regclass)" @@ -2717,6 +2771,7 @@ class LibraryVersion(Base): library_version_id = Column( BigInteger, + Sequence('library_version_library_version_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.library_version_library_version_id_seq'::regclass)" @@ -2742,6 +2797,7 @@ class MessageAnalysis(Base): msg_analysis_id = Column( BigInteger, + Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.message_analysis_msg_analysis_id_seq'::regclass)" @@ -2784,6 +2840,7 @@ class MessageSentiment(Base): msg_analysis_id = Column( BigInteger, + Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.message_sentiment_msg_analysis_id_seq'::regclass)" @@ -2824,6 +2881,7 @@ class PullRequestAnalysis(Base): pull_request_analysis_id = Column( BigInteger, + Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" @@ -2873,6 +2931,7 @@ class PullRequestAssignee(Base): pr_assignee_map_id = Column( BigInteger, + Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, 
schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" @@ -2935,6 +2994,7 @@ class PullRequestCommit(Base): pr_cmt_id = Column( BigInteger, + Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_commits_pr_cmt_id_seq'::regclass)" @@ -2981,6 +3041,7 @@ class PullRequestEvent(Base): __table_args__ = ( Index("pr_events_ibfk_1", "pull_request_id"), Index("pr_events_ibfk_2", "cntrb_id"), + UniqueConstraint("repo_id", "issue_event_src_id", name="pr_events_repo_id_event_src_id_unique"), UniqueConstraint("platform_id", "node_id", name="unique-pr-event-id"), UniqueConstraint("node_id", name="pr-unqiue-event"), {"schema": "augur_data"}, @@ -2988,6 +3049,7 @@ class PullRequestEvent(Base): pr_event_id = Column( BigInteger, + Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_events_pr_event_id_seq'::regclass)" @@ -3077,7 +3139,8 @@ def from_github(cls, event, pr_id, repo_id, tool_source, tool_version, data_sour class PullRequestFile(Base): __tablename__ = "pull_request_files" __table_args__ = ( - UniqueConstraint("pull_request_id", "repo_id", "pr_file_path"), + Index("pr_id_pr_files","pull_request_id"), + UniqueConstraint("pull_request_id", "repo_id", "pr_file_path", name="prfiles_unique"), { "schema": "augur_data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. ", @@ -3086,6 +3149,7 @@ class PullRequestFile(Base): pr_file_id = Column( BigInteger, + Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_files_pr_file_id_seq'::regclass)" @@ -3137,6 +3201,7 @@ class PullRequestLabel(Base): pr_label_id = Column( BigInteger, + Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_labels_pr_label_id_seq'::regclass)" @@ -3198,6 +3263,7 @@ class PullRequestMessageRef(Base): pr_msg_ref_id = Column( BigInteger, + Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" @@ -3245,11 +3311,12 @@ class PullRequestMeta(Base): Index("pr_meta-cntrbid-idx", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_head_or_base", 'pr_sha', name="pull-request-meta-insert-unique"), {"schema": "augur_data", - "comment": 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. Similar functions exist in GitLab, though the language here is based on GitHub. 
The JSON Being adapted to as of the development of this schema is here: "base": { "label": "chaoss:dev", "ref": "dev", "sha": "dc6c6f3947f7dc84ecba3d8bda641ef786e7027d", "user": { "login": "chaoss", "id": 29740296, "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2", "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/chaoss", "html_url": "https://github.com/chaoss", "followers_url": "https://api.github.com/users/chaoss/followers", "following_url": "https://api.github.com/users/chaoss/following{/other_user}", "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}", "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions", "organizations_url": "https://api.github.com/users/chaoss/orgs", "repos_url": "https://api.github.com/users/chaoss/repos", "events_url": "https://api.github.com/users/chaoss/events{/privacy}", "received_events_url": "https://api.github.com/users/chaoss/received_events", "type": "Organization", "site_admin": false }, "repo": { "id": 78134122, "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==", "name": "augur", "full_name": "chaoss/augur", "private": false, "owner": { "login": "chaoss", "id": 29740296, "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2", "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/chaoss", "html_url": "https://github.com/chaoss", "followers_url": "https://api.github.com/users/chaoss/followers", "following_url": "https://api.github.com/users/chaoss/following{/other_user}", "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}", "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions", "organizations_url": "https://api.github.com/users/chaoss/orgs", "repos_url": "https://api.github.com/users/chaoss/repos", "events_url": "https://api.github.com/users/chaoss/events{/privacy}", "received_events_url": "https://api.github.com/users/chaoss/received_events", "type": "Organization", "site_admin": false }, '}, + "comment": 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. Similar functions exist in GitLab, though the language here is based on GitHub.'}, ) pr_repo_meta_id = Column( BigInteger, + Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" @@ -3277,7 +3344,7 @@ class PullRequestMeta(Base): ) pr_src_meta_label = Column( String, - comment='This is a representation of the repo:branch information in the pull request. Head is issueing the pull request and base is taking the pull request. 
For example: (We do not store all of this)\n\n "head": {\n "label": "chaoss:pull-request-worker",\n "ref": "pull-request-worker",\n "sha": "6b380c3d6d625616f79d702612ebab6d204614f2",\n "user": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "repo": {\n "id": 78134122,\n "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==",\n "name": "augur",\n "full_name": "chaoss/augur",\n "private": false,\n "owner": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "html_url": "https://github.com/chaoss/augur",\n "description": "Python library and web service for Open Source Software Health and Sustainability metrics & data collection.",\n "fork": false,\n "url": "https://api.github.com/repos/chaoss/augur",\n "forks_url": "https://api.github.com/repos/chaoss/augur/forks",\n "keys_url": "https://api.github.com/repos/chaoss/augur/keys{/key_id}",\n "collaborators_url": "https://api.github.com/repos/chaoss/augur/collaborators{/collaborator}",\n "teams_url": "https://api.github.com/repos/chaoss/augur/teams",\n "hooks_url": "https://api.github.com/repos/chaoss/augur/hooks",\n "issue_events_url": "https://api.github.com/repos/chaoss/augur/issues/events{/number}",\n "events_url": "https://api.github.com/repos/chaoss/augur/events",\n "assignees_url": "https://api.github.com/repos/chaoss/augur/assignees{/user}",\n "branches_url": "https://api.github.com/repos/chaoss/augur/branches{/branch}",\n "tags_url": "https://api.github.com/repos/chaoss/augur/tags",\n "blobs_url": "https://api.github.com/repos/chaoss/augur/git/blobs{/sha}",\n "git_tags_url": "https://api.github.com/repos/chaoss/augur/git/tags{/sha}",\n "git_refs_url": "https://api.github.com/repos/chaoss/augur/git/refs{/sha}",\n 
"trees_url": "https://api.github.com/repos/chaoss/augur/git/trees{/sha}",\n "statuses_url": "https://api.github.com/repos/chaoss/augur/statuses/{sha}",\n "languages_url": "https://api.github.com/repos/chaoss/augur/languages",\n "stargazers_url": "https://api.github.com/repos/chaoss/augur/stargazers",\n "contributors_url": "https://api.github.com/repos/chaoss/augur/contributors",\n "subscribers_url": "https://api.github.com/repos/chaoss/augur/subscribers",\n "subscription_url": "https://api.github.com/repos/chaoss/augur/subscription",\n "commits_url": "https://api.github.com/repos/chaoss/augur/commits{/sha}",\n "git_commits_url": "https://api.github.com/repos/chaoss/augur/git/commits{/sha}",\n "comments_url": "https://api.github.com/repos/chaoss/augur/comments{/number}",\n "issue_comment_url": "https://api.github.com/repos/chaoss/augur/issues/comments{/number}",\n "contents_url": "https://api.github.com/repos/chaoss/augur/contents/{+path}",\n "compare_url": "https://api.github.com/repos/chaoss/augur/compare/{base}...{head}",\n "merges_url": "https://api.github.com/repos/chaoss/augur/merges",\n "archive_url": "https://api.github.com/repos/chaoss/augur/{archive_format}{/ref}",\n "downloads_url": "https://api.github.com/repos/chaoss/augur/downloads",\n "issues_url": "https://api.github.com/repos/chaoss/augur/issues{/number}",\n "pulls_url": "https://api.github.com/repos/chaoss/augur/pulls{/number}",\n "milestones_url": "https://api.github.com/repos/chaoss/augur/milestones{/number}",\n "notifications_url": "https://api.github.com/repos/chaoss/augur/notifications{?since,all,participating}",\n "labels_url": "https://api.github.com/repos/chaoss/augur/labels{/name}",\n "releases_url": "https://api.github.com/repos/chaoss/augur/releases{/id}",\n "deployments_url": "https://api.github.com/repos/chaoss/augur/deployments",\n "created_at": "2017-01-05T17:34:54Z",\n "updated_at": "2019-11-15T00:56:12Z",\n "pushed_at": "2019-12-02T06:27:26Z",\n "git_url": "git://github.com/chaoss/augur.git",\n "ssh_url": "git@github.com:chaoss/augur.git",\n "clone_url": "https://github.com/chaoss/augur.git",\n "svn_url": "https://github.com/chaoss/augur",\n "homepage": "http://augur.osshealth.io/",\n "size": 82004,\n "stargazers_count": 153,\n "watchers_count": 153,\n "language": "Python",\n "has_issues": true,\n "has_projects": false,\n "has_downloads": true,\n "has_wiki": false,\n "has_pages": true,\n "forks_count": 205,\n "mirror_url": null,\n "archived": false,\n "disabled": false,\n "open_issues_count": 14,\n "license": {\n "key": "mit",\n "name": "MIT License",\n "spdx_id": "MIT",\n "url": "https://api.github.com/licenses/mit",\n "node_id": "MDc6TGljZW5zZTEz"\n },\n "forks": 205,\n "open_issues": 14,\n "watchers": 153,\n "default_branch": "master"\n }\n },\n "base": {\n "label": "chaoss:dev",\n "ref": "dev",\n "sha": "bfd2d34b51659613dd842cf83c3873f7699c2a0e",\n "user": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n 
"organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "repo": {\n "id": 78134122,\n "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==",\n "name": "augur",\n "full_name": "chaoss/augur",\n "private": false,\n "owner": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n', + comment='This is a representation of the repo:branch information in the pull request. Head is issueing the pull request and base is taking the pull request.', ) pr_src_meta_ref = Column(String) pr_sha = Column(String) @@ -3322,6 +3389,7 @@ class PullRequestReviewer(Base): pr_reviewer_map_id = Column( BigInteger, + Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" @@ -3376,12 +3444,14 @@ def from_github(cls, reviewer, repo_id, tool_source, tool_version, data_source): class PullRequestReview(Base): __tablename__ = "pull_request_reviews" __table_args__ = ( - UniqueConstraint("pr_review_src_id", "tool_source"), + UniqueConstraint("pr_review_src_id", name="pr_review_unique"), + Index("pr_id_pr_reviews", "pull_request_id"), {"schema": "augur_data"}, ) pr_review_id = Column( BigInteger, + Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_reviews_pr_review_id_seq'::regclass)" @@ -3449,6 +3519,7 @@ class PullRequestTeam(Base): pr_team_id = Column( BigInteger, + Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_teams_pr_team_id_seq'::regclass)" @@ -3492,6 +3563,7 @@ class PullRequestRepo(Base): pr_repo_id = Column( BigInteger, + Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_repo_pr_repo_id_seq'::regclass)" @@ -3534,6 +3606,7 @@ class PullRequestReviewMessageRef(Base): pr_review_msg_ref_id = Column( BigInteger, + Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" @@ -3606,6 +3679,7 @@ class RepoClone(Base): 
repo_clone_data_id = Column( BigInteger, + Sequence('repo_clones_data_id_seq', start=1, schema='augur_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_clones_data_id_seq'::regclass)" @@ -3721,9 +3795,9 @@ class TopicModelMeta(Base): nullable=False, comment="When training ended" ) - tool_source = Column(String, comment="Standard Augur Metadata") - tool_version = Column(String, comment="Standard Augur Metadata") - data_source = Column(String, comment="Standard Augur Metadata") + tool_source = Column(String, comment="Standard Collection Metadata") + tool_version = Column(String, comment="Standard Collection Metadata") + data_source = Column(String, comment="Standard Collection Metadata") data_collection_date = Column( TIMESTAMP(timezone=True, precision=0), server_default=text("CURRENT_TIMESTAMP") diff --git a/augur/application/db/models/augur_operations.py b/collectoss/application/db/models/augur_operations.py similarity index 95% rename from augur/application/db/models/augur_operations.py rename to collectoss/application/db/models/augur_operations.py index 12bb94e6c..41a4cef6b 100644 --- a/augur/application/db/models/augur_operations.py +++ b/collectoss/application/db/models/augur_operations.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from sqlalchemy import BigInteger, SmallInteger, Column, Index, Integer, String, Table, text, UniqueConstraint, Boolean, ForeignKey, update, CheckConstraint +from sqlalchemy import BigInteger, SmallInteger, Column, Index, Integer, String, Table, text, UniqueConstraint, Boolean, ForeignKey, update, CheckConstraint, Sequence from sqlalchemy.dialects.postgresql import TIMESTAMP from sqlalchemy.orm.exc import NoResultFound, MultipleResultsFound from sqlalchemy.exc import IntegrityError @@ -11,9 +11,9 @@ import secrets import traceback -from augur.application.db.models import Repo, RepoGroup -from augur.application.db.session import DatabaseSession -from augur.application.db.models.base import Base +from collectoss.application.db.models import Repo, RepoGroup +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db.models.base import Base FRONTEND_REPO_GROUP_NAME = "Frontend Repos" logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: Returns List of valid repo urls or empty list if invalid org """ - from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException + from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException OWNER_INFO_ENDPOINT = f"https://api.github.com/users/{owner}" ORG_REPOS_ENDPOINT = f"https://api.github.com/orgs/{owner}/repos?per_page=100" @@ -87,15 +87,16 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: ) -class AugurSetting(Base): +class Settings(Base): __tablename__ = "augur_settings" __table_args__ = { "schema": "augur_operations", - "comment": "Augur settings include the schema version, and the Augur API Key as of 10/25/2020. Future augur settings may be stored in this table, which has the basic structure of a name-value pair. ", + "comment": "CollectOSS settings include the schema version, and the CollectOSS API Key as of 10/25/2020. Future augur settings may be stored in this table, which has the basic structure of a name-value pair. 
", } id = Column( BigInteger, + Sequence("augur_settings_id_seq", start=1, schema="augur_operations"), primary_key=True, server_default=text( "nextval('augur_operations.augur_settings_id_seq'::regclass)" @@ -131,6 +132,7 @@ class WorkerHistory(Base): history_id = Column( BigInteger, + Sequence("gh_worker_history_history_id_seq", start=1, schema="augur_operations"), primary_key=True, server_default=text( "nextval('augur_operations.gh_worker_history_history_id_seq'::regclass)" @@ -176,6 +178,7 @@ class WorkerOauth(Base): oauth_id = Column( BigInteger, + Sequence("worker_oauth_oauth_id_seq", start=1000, schema="augur_operations"), primary_key=True, server_default=text( "nextval('augur_operations.worker_oauth_oauth_id_seq'::regclass)" @@ -267,7 +270,7 @@ class User(Base): tool_version = Column(String) data_source = Column(String) data_collection_date = Column(TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP")) - + email_verified = Column(Boolean, server_default='false', nullable=False) groups = relationship("UserGroup", back_populates="user") tokens = relationship("UserSessionToken", back_populates="user") @@ -452,8 +455,8 @@ def remove_group(self, group_name): def add_github_repo(self, group_name, repo_url): - from augur.tasks.github.util.github_task_session import GithubTaskSession - from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + from collectoss.tasks.github.util.github_task_session import GithubTaskSession + from collectoss.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: result = UserRepo.add_github_repo(session, repo_url, self.user_id, group_name) @@ -464,8 +467,8 @@ def add_github_repo(self, group_name, repo_url): def add_gitlab_repo(self, group_name, repo_url): - from augur.tasks.gitlab.gitlab_task_session import GitlabTaskSession - from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + from collectoss.tasks.gitlab.gitlab_task_session import GitlabTaskSession + from collectoss.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GitlabTaskSession(logger) as session: result = UserRepo.add_gitlab_repo(session, repo_url, self.user_id, group_name) @@ -484,8 +487,8 @@ def remove_repo(self, group_name, repo_id): def add_github_org(self, group_name, org_url): - from augur.tasks.github.util.github_task_session import GithubTaskSession - from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + from collectoss.tasks.github.util.github_task_session import GithubTaskSession + from collectoss.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: @@ -536,7 +539,7 @@ def sorting_function(group): def get_repos(self, page=0, page_size=25, sort="repo_id", direction="ASC", search=None): - from augur.util.repo_load_controller import RepoLoadController + from collectoss.util.repo_load_controller import RepoLoadController with DatabaseSession(logger) as session: result = RepoLoadController(session).paginate_repos("user", page, page_size, sort, direction, user=self, search=search) @@ -544,7 +547,7 @@ def get_repos(self, page=0, page_size=25, sort="repo_id", direction="ASC", searc return result def get_repo_count(self, search = None): - from augur.util.repo_load_controller import RepoLoadController + from collectoss.util.repo_load_controller import RepoLoadController with DatabaseSession(logger) as session: result = RepoLoadController(session).get_repo_count(source="user", 
user=self, search = search) @@ -553,7 +556,7 @@ def get_repo_count(self, search = None): def get_group_repos(self, group_name, page=0, page_size=25, sort="repo_id", direction="ASC", search=None): - from augur.util.repo_load_controller import RepoLoadController + from collectoss.util.repo_load_controller import RepoLoadController with DatabaseSession(logger) as session: result = RepoLoadController(session).paginate_repos("group", page, page_size, sort, direction, user=self, group_name=group_name, search=search) @@ -562,7 +565,7 @@ def get_group_repos(self, group_name, page=0, page_size=25, sort="repo_id", dire def get_group_repo_count(self, group_name, search = None): - from augur.util.repo_load_controller import RepoLoadController + from collectoss.util.repo_load_controller import RepoLoadController with DatabaseSession(logger) as session: result = RepoLoadController(session).get_repo_count(source="group", group_name=group_name, user=self, search=search) @@ -630,13 +633,13 @@ def compute_hashsed_password(password): class UserGroup(Base): __tablename__ = 'user_groups' __table_args__ = ( - UniqueConstraint('user_id', 'name', name='user_group_unique'), + UniqueConstraint('user_id', 'name', name='user_groups_user_id_name_key'), {"schema": "augur_operations"} ) group_id = Column(BigInteger, primary_key=True) user_id = Column(Integer, - ForeignKey("augur_operations.users.user_id", name="user_group_user_id_fkey") + ForeignKey("augur_operations.users.user_id", name="user_group_user_id_fkey"), nullable=False ) name = Column(String, nullable=False) favorited = Column(Boolean, nullable=False, server_default=text("FALSE")) @@ -1010,9 +1013,9 @@ class UserSessionToken(Base): __table_args__ = { "schema": "augur_operations" } token = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("augur_operations.users.user_id", name="user_session_token_user_id_fkey")) + user_id = Column(ForeignKey("augur_operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) expiration = Column(BigInteger) - application_id = Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey"), nullable=False) + application_id = Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey")) created_at = Column(BigInteger) user = relationship("User", back_populates="tokens") @@ -1216,8 +1219,8 @@ class CollectionStatus(Base): @staticmethod def insert(session, logger, repo_id): - from augur.tasks.github.util.util import get_repo_weight_by_issue - from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps + from collectoss.tasks.github.util.util import get_repo_weight_by_issue + from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps repo = Repo.get_by_id(session, repo_id) repo_git = repo.repo_git diff --git a/augur/application/db/models/base.py b/collectoss/application/db/models/base.py similarity index 100% rename from augur/application/db/models/base.py rename to collectoss/application/db/models/base.py diff --git a/augur/application/db/models/spdx.py b/collectoss/application/db/models/spdx.py similarity index 87% rename from augur/application/db/models/spdx.py rename to collectoss/application/db/models/spdx.py index 4e981dc54..aa9075766 100644 --- a/augur/application/db/models/spdx.py +++ b/collectoss/application/db/models/spdx.py @@ -11,12 +11,13 @@ Text, UniqueConstraint, text, + Sequence ) from sqlalchemy.orm import relationship from 
sqlalchemy.dialects.postgresql import TIMESTAMP from sqlalchemy.ext.associationproxy import association_proxy -from augur.application.db.models.base import Base +from collectoss.application.db.models.base import Base metadata = Base.metadata @@ -27,22 +28,24 @@ class SpdxAnnotationType(Base): annotation_type_id = Column( Integer, - primary_key=True, + Sequence("annotation_types_annotation_type_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.annotation_types_annotation_type_id_seq'::regclass)" ), + primary_key=True ) name = Column(String(255), nullable=False, unique=True) -class SpdxAugurRepoMap(Base): +class SpdxRepoMap(Base): __tablename__ = "augur_repo_map" __table_args__ = {"schema": "spdx"} map_id = Column( Integer, - primary_key=True, + Sequence("augur_repo_map_map_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.augur_repo_map_map_id_seq'::regclass)"), + primary_key=True ) dosocs_pkg_id = Column(Integer) dosocs_pkg_name = Column(Text) @@ -56,10 +59,11 @@ class SpdxCreatorType(Base): creator_type_id = Column( Integer, - primary_key=True, + Sequence("creator_types_creator_type_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.creator_types_creator_type_id_seq'::regclass)" ), + primary_key=True ) name = Column(String(255), nullable=False) @@ -70,10 +74,11 @@ class SpdxDocumentNamespace(Base): document_namespace_id = Column( Integer, - primary_key=True, + Sequence("document_namespaces_document_namespace_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.document_namespaces_document_namespace_id_seq'::regclass)" ), + primary_key=True ) uri = Column(String(500), nullable=False, unique=True) @@ -92,8 +97,9 @@ class SpdxFile(Base): file_id = Column( Integer, - primary_key=True, + Sequence("files_file_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.files_file_id_seq'::regclass)"), + primary_key=True ) file_type_id = Column(Integer) sha256 = Column(String(64), nullable=False, unique=True) @@ -109,8 +115,9 @@ class SpdxLicense(Base): license_id = Column( Integer, - primary_key=True, + Sequence("licenses_license_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.licenses_license_id_seq'::regclass)"), + primary_key=True ) name = Column(String(255)) short_name = Column(String(255), nullable=False, unique=True) @@ -131,8 +138,9 @@ class SpdxPackage(Base): package_id = Column( Integer, - primary_key=True, + Sequence("packages_package_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.packages_package_id_seq'::regclass)"), + primary_key=True ) name = Column(String(255), nullable=False) version = Column(String(255), nullable=False) @@ -183,10 +191,11 @@ class SpdxPackagesFile(Base): package_file_id = Column( Integer, - primary_key=True, + Sequence("packages_files_package_file_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.packages_files_package_file_id_seq'::regclass)" ), + primary_key=True ) package_id = Column(ForeignKey("spdx.packages.package_id"), nullable=False) file_id = Column(ForeignKey("spdx.files.file_id"), nullable=False) @@ -207,8 +216,9 @@ class SpdxProject(Base): package_id = Column( Integer, - primary_key=True, + Sequence("projects_package_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.projects_package_id_seq'::regclass)"), + primary_key=True ) name = Column(Text, nullable=False) homepage = Column(Text, nullable=False) @@ -221,10 +231,11 @@ class SpdxRelationshipType(Base): relationship_type_id = Column( 
Integer, - primary_key=True, + Sequence("relationship_types_relationship_type_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.relationship_types_relationship_type_id_seq'::regclass)" ), + primary_key=True ) name = Column(String(255), nullable=False, unique=True) @@ -244,8 +255,9 @@ class SpdxScanner(Base): scanner_id = Column( Integer, - primary_key=True, + Sequence("scanners_scanner_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.scanners_scanner_id_seq'::regclass)"), + primary_key=True ) name = Column(String(255), nullable=False, unique=True) @@ -256,8 +268,9 @@ class SpdxCreator(Base): creator_id = Column( Integer, - primary_key=True, + Sequence("creators_creator_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.creators_creator_id_seq'::regclass)"), + primary_key=True ) creator_type_id = Column( ForeignKey("spdx.creator_types.creator_type_id"), nullable=False @@ -274,8 +287,9 @@ class SpdxDocument(Base): document_id = Column( Integer, - primary_key=True, + Sequence("documents_document_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.documents_document_id_seq'::regclass)"), + primary_key=True ) document_namespace_id = Column( ForeignKey("spdx.document_namespaces.document_namespace_id"), @@ -302,10 +316,11 @@ class SpdxFileContributor(Base): file_contributor_id = Column( Integer, - primary_key=True, + Sequence("file_contributors_file_contributor_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.file_contributors_file_contributor_id_seq'::regclass)" ), + primary_key=True ) file_id = Column(ForeignKey("spdx.files.file_id"), nullable=False) contributor = Column(Text, nullable=False) @@ -322,10 +337,11 @@ class SpdxFilesLicense(Base): file_license_id = Column( Integer, - primary_key=True, + Sequence("files_licenses_file_license_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.files_licenses_file_license_id_seq'::regclass)" ), + primary_key=True ) file_id = Column(ForeignKey("spdx.files.file_id"), nullable=False) license_id = Column(ForeignKey("spdx.licenses.license_id"), nullable=False) @@ -344,8 +360,9 @@ class SpdxFilesScan(Base): file_scan_id = Column( Integer, - primary_key=True, + Sequence("files_scans_file_scan_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.files_scans_file_scan_id_seq'::regclass)"), + primary_key=True ) file_id = Column(ForeignKey("spdx.files.file_id"), nullable=False) scanner_id = Column(ForeignKey("spdx.scanners.scanner_id"), nullable=False) @@ -363,10 +380,11 @@ class SpdxPackagesScan(Base): package_scan_id = Column( Integer, - primary_key=True, + Sequence("packages_scans_package_scan_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.packages_scans_package_scan_id_seq'::regclass)" ), + primary_key=True ) package_id = Column(ForeignKey("spdx.packages.package_id"), nullable=False) scanner_id = Column(ForeignKey("spdx.scanners.scanner_id"), nullable=False) @@ -381,10 +399,11 @@ class SpdxDocumentsCreator(Base): document_creator_id = Column( Integer, - primary_key=True, + Sequence("documents_creators_document_creator_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.documents_creators_document_creator_id_seq'::regclass)" ), + primary_key=True ) document_id = Column(ForeignKey("spdx.documents.document_id"), nullable=False) creator_id = Column(ForeignKey("spdx.creators.creator_id"), nullable=False) @@ -402,10 +421,11 @@ class SpdxExternalRef(Base): external_ref_id = Column( Integer, - 
primary_key=True, + Sequence("external_refs_external_ref_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.external_refs_external_ref_id_seq'::regclass)" ), + primary_key=True ) document_id = Column(ForeignKey("spdx.documents.document_id"), nullable=False) document_namespace_id = Column( @@ -424,8 +444,9 @@ class SpdxAnnotation(Base): annotation_id = Column( Integer, - primary_key=True, + Sequence("annotations_annotation_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.annotations_annotation_id_seq'::regclass)"), + primary_key=True ) document_id = Column(ForeignKey("spdx.documents.document_id"), nullable=False) annotation_type_id = Column( @@ -453,10 +474,11 @@ class SpdxRelationship(Base): relationship_id = Column( Integer, - primary_key=True, + Sequence("relationships_relationship_id_seq", start=1, schema="spdx"), server_default=text( "nextval('spdx.relationships_relationship_id_seq'::regclass)" ), + primary_key=True ) left_identifier_id = Column( ForeignKey("spdx.identifiers.identifier_id"), nullable=False @@ -497,8 +519,9 @@ class SpdxIdentifier(Base): identifier_id = Column( Integer, - primary_key=True, + Sequence("identifiers_identifier_id_seq", start=1, schema="spdx"), server_default=text("nextval('spdx.identifiers_identifier_id_seq'::regclass)"), + primary_key=True ) document_namespace_id = Column( ForeignKey("spdx.document_namespaces.document_namespace_id"), nullable=False diff --git a/augur/application/db/session.py b/collectoss/application/db/session.py similarity index 97% rename from augur/application/db/session.py rename to collectoss/application/db/session.py index 920b6fe6b..48cf4986b 100644 --- a/augur/application/db/session.py +++ b/collectoss/application/db/session.py @@ -8,8 +8,8 @@ from typing import Optional, List, Union from psycopg2.errors import DeadlockDetected -# from augur.tasks.util.random_key_auth import RandomKeyAuth -from augur.tasks.util.worker_util import remove_duplicates_by_uniques +# from collectoss.tasks.util.random_key_auth import RandomKeyAuth +from collectoss.tasks.util.worker_util import remove_duplicates_by_uniques def remove_null_characters_from_string(string): @@ -54,7 +54,7 @@ def __init__(self, logger, engine=None, from_msg=None, **kwargs): if self.engine is None: self.logger.debug("Passing engine will be required soon") - from augur.application.db.engine import DatabaseEngine + from collectoss.application.db.engine import DatabaseEngine self.engine_created = True diff --git a/augur/application/db/util.py b/collectoss/application/db/util.py similarity index 100% rename from augur/application/db/util.py rename to collectoss/application/db/util.py diff --git a/augur/application/log_analysis/__init__.py b/collectoss/application/log_analysis/__init__.py similarity index 100% rename from augur/application/log_analysis/__init__.py rename to collectoss/application/log_analysis/__init__.py diff --git a/augur/application/log_analysis/http/__init__.py b/collectoss/application/log_analysis/http/__init__.py similarity index 100% rename from augur/application/log_analysis/http/__init__.py rename to collectoss/application/log_analysis/http/__init__.py diff --git a/collectoss/application/log_analysis/http/empty_index.html b/collectoss/application/log_analysis/http/empty_index.html new file mode 100644 index 000000000..91b08483a --- /dev/null +++ b/collectoss/application/log_analysis/http/empty_index.html @@ -0,0 +1,96 @@ + + + + + + + +
+ [96-line HTML page stripped during extraction; the only surviving visible text is the "Select a worker:" dropdown label from this static log-analysis viewer]
+ + + + + + + + + + + \ No newline at end of file diff --git a/augur/application/log_analysis/http/http_server.py b/collectoss/application/log_analysis/http/http_server.py similarity index 97% rename from augur/application/log_analysis/http/http_server.py rename to collectoss/application/log_analysis/http/http_server.py index 80fe4d049..efd182b71 100644 --- a/augur/application/log_analysis/http/http_server.py +++ b/collectoss/application/log_analysis/http/http_server.py @@ -19,7 +19,7 @@ def generateHTML(self, json_data): worker_name = re.search("[a-z_]{1,20}worker", log_file).group() if not worker_name: - print("Failed to get worker name, so cannot associate input data with any augur worker. Data: " + json_data) + print("Failed to get worker name, so cannot associate input data with any collectoss worker. Data: " + json_data) return if "error" not in json_data: diff --git a/collectoss/application/log_analysis/http/index.html b/collectoss/application/log_analysis/http/index.html new file mode 100644 index 000000000..91b08483a --- /dev/null +++ b/collectoss/application/log_analysis/http/index.html @@ -0,0 +1,96 @@ + + + + + + + +
+ [identical 96-line HTML page (same blob 91b08483a as empty_index.html above) stripped during extraction; only the "Select a worker:" dropdown label survives]
+ + + + + + + + + + + \ No newline at end of file diff --git a/augur/application/log_analysis/logstash-filter.conf b/collectoss/application/log_analysis/logstash-filter.conf similarity index 78% rename from augur/application/log_analysis/logstash-filter.conf rename to collectoss/application/log_analysis/logstash-filter.conf index 3e1a64db9..255ad6749 100644 --- a/augur/application/log_analysis/logstash-filter.conf +++ b/collectoss/application/log_analysis/logstash-filter.conf @@ -2,7 +2,7 @@ input { file { - path => "${ROOT_AUGUR_DIRECTORY}/logs/workers/**/*.*" + path => "${ROOT_PROJECT_REPO_DIRECTORY}/logs/workers/**/*.*" start_position => beginning # sincedb_path => "/dev/null" # ignore_older => 0 @@ -17,7 +17,7 @@ input { filter { grok { - patterns_dir => ["${ROOT_AUGUR_DIRECTORY}/log_analysis/patterns"] + patterns_dir => ["${ROOT_PROJECT_REPO_DIRECTORY}/log_analysis/patterns"] match => { "message" => "%{ERROR_MESSAGE:error}" } } date { @@ -30,7 +30,7 @@ output { # stdout { codec => rubydebug } # if [error] { # file { - # path => "${ROOT_AUGUR_DIRECTORY}/log_analysis/tmp.json" + # path => "${ROOT_PROJECT_REPO_DIRECTORY}/log_analysis/tmp.json" # create_if_deleted => true # write_behavior => append # } diff --git a/augur/application/log_analysis/patterns/error_detectors b/collectoss/application/log_analysis/patterns/error_detectors similarity index 100% rename from augur/application/log_analysis/patterns/error_detectors rename to collectoss/application/log_analysis/patterns/error_detectors diff --git a/augur/application/logs.py b/collectoss/application/logs.py similarity index 91% rename from augur/application/logs.py rename to collectoss/application/logs.py index 1fb8709b8..253482877 100644 --- a/augur/application/logs.py +++ b/collectoss/application/logs.py @@ -10,11 +10,11 @@ import coloredlogs from sqlalchemy.orm import Session -from augur.application.db.models import Config -from augur.application.config import convert_type_of_value -from augur.application.db.util import execute_session_query +from collectoss.application.db.models import Config +from collectoss.application.config import convert_type_of_value +from collectoss.application.db.util import execute_session_query -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) SIMPLE_FORMAT_STRING = "[%(process)d] %(name)s [%(levelname)s] %(message)s" @@ -87,7 +87,7 @@ def initialize_stream_handler(logger, log_level): def get_log_config(): - from augur.application.db.engine import DatabaseEngine + from collectoss.application.db.engine import DatabaseEngine # we are using this session instead of the # DatabaseSession class because the DatabaseSession @@ -117,7 +117,7 @@ def get_log_config(): #TODO dynamically define loggers for every task names. 
class TaskLogConfig(): - def __init__(self, all_tasks, disable_log_files=False,reset_logfiles=False,base_log_dir=ROOT_AUGUR_DIRECTORY + "/logs/"): + def __init__(self, all_tasks, disable_log_files=False,reset_logfiles=False,base_log_dir=ROOT_PROJECT_REPO_DIRECTORY + "/logs/"): log_config = get_log_config() @@ -187,8 +187,8 @@ def getLoggerNames(self): return self.logger_names -class AugurLogger(): - def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,base_log_dir=ROOT_AUGUR_DIRECTORY + "/logs/"): +class SystemLogger(): + def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,base_log_dir=ROOT_PROJECT_REPO_DIRECTORY + "/logs/"): log_config = get_log_config() @@ -197,7 +197,7 @@ def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,bas if reset_logfiles is True: try: - print("(augur) Reseting log files") + print("(collectoss) Resetting log files") base_log_dir_path = Path(base_log_dir) for item in base_log_dir_path.iterdir(): if item.is_dir(): @@ -226,11 +226,11 @@ def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,bas #Don't bother if file logs are disabled. if not self.disable_log_files: - self.initialize_augur_logger_file_logging(self.lg) + self.initialize_file_logging(self.lg) self.lg.propagate = False - def initialize_augur_logger_file_logging(self, logger): + def initialize_file_logging(self, logger): file = str(self.base_log_dir) + "/" + str(self.logger_name) diff --git a/augur/application/schema/20210811-augur-0.2.1-release.ndm2 b/collectoss/application/schema/20210811-augur-0.2.1-release.ndm2 similarity index 100% rename from augur/application/schema/20210811-augur-0.2.1-release.ndm2 rename to collectoss/application/schema/20210811-augur-0.2.1-release.ndm2 diff --git a/augur/application/schema/20210811-augur-0.2.1-release.pdf b/collectoss/application/schema/20210811-augur-0.2.1-release.pdf similarity index 100% rename from augur/application/schema/20210811-augur-0.2.1-release.pdf rename to collectoss/application/schema/20210811-augur-0.2.1-release.pdf diff --git a/augur/application/schema/20210811-augur-0.2.1-release.png b/collectoss/application/schema/20210811-augur-0.2.1-release.png similarity index 100% rename from augur/application/schema/20210811-augur-0.2.1-release.png rename to collectoss/application/schema/20210811-augur-0.2.1-release.png diff --git a/augur/application/schema/20210811-augur-0.2.1-release.svg b/collectoss/application/schema/20210811-augur-0.2.1-release.svg similarity index 100% rename from augur/application/schema/20210811-augur-0.2.1-release.svg rename to collectoss/application/schema/20210811-augur-0.2.1-release.svg diff --git a/augur/application/schema/20211006-augur-0.21.0-release.ndm2 b/collectoss/application/schema/20211006-augur-0.21.0-release.ndm2 similarity index 100% rename from augur/application/schema/20211006-augur-0.21.0-release.ndm2 rename to collectoss/application/schema/20211006-augur-0.21.0-release.ndm2 diff --git a/augur/application/schema/__init__.py b/collectoss/application/schema/__init__.py similarity index 100% rename from augur/application/schema/__init__.py rename to collectoss/application/schema/__init__.py diff --git a/augur/application/schema/alembic/README b/collectoss/application/schema/alembic/README similarity index 100% rename from augur/application/schema/alembic/README rename to collectoss/application/schema/alembic/README diff --git a/augur/application/schema/alembic/__init__.py b/collectoss/application/schema/alembic/__init__.py
similarity index 100% rename from augur/application/schema/alembic/__init__.py rename to collectoss/application/schema/alembic/__init__.py diff --git a/augur/application/schema/alembic/env.py b/collectoss/application/schema/alembic/env.py similarity index 96% rename from augur/application/schema/alembic/env.py rename to collectoss/application/schema/alembic/env.py index 3ae3afdb5..d7f160d49 100644 --- a/augur/application/schema/alembic/env.py +++ b/collectoss/application/schema/alembic/env.py @@ -2,8 +2,8 @@ from alembic import context -from augur.application.db.models.base import Base -from augur.application.db.engine import get_database_string +from collectoss.application.db.models.base import Base +from collectoss.application.db.engine import get_database_string from sqlalchemy import create_engine from dotenv import load_dotenv import re diff --git a/augur/application/schema/alembic/script.py.mako b/collectoss/application/schema/alembic/script.py.mako similarity index 100% rename from augur/application/schema/alembic/script.py.mako rename to collectoss/application/schema/alembic/script.py.mako diff --git a/augur/application/schema/alembic/versions/0_legacy.py b/collectoss/application/schema/alembic/versions/0_legacy.py similarity index 100% rename from augur/application/schema/alembic/versions/0_legacy.py rename to collectoss/application/schema/alembic/versions/0_legacy.py diff --git a/augur/application/schema/alembic/versions/10_unique_constraints_on_dependency_data.py b/collectoss/application/schema/alembic/versions/10_unique_constraints_on_dependency_data.py similarity index 100% rename from augur/application/schema/alembic/versions/10_unique_constraints_on_dependency_data.py rename to collectoss/application/schema/alembic/versions/10_unique_constraints_on_dependency_data.py diff --git a/augur/application/schema/alembic/versions/11_change_ossf_scorecard_data_to_store_json.py b/collectoss/application/schema/alembic/versions/11_change_ossf_scorecard_data_to_store_json.py similarity index 100% rename from augur/application/schema/alembic/versions/11_change_ossf_scorecard_data_to_store_json.py rename to collectoss/application/schema/alembic/versions/11_change_ossf_scorecard_data_to_store_json.py diff --git a/augur/application/schema/alembic/versions/12_traffic_additions.py b/collectoss/application/schema/alembic/versions/12_traffic_additions.py similarity index 94% rename from augur/application/schema/alembic/versions/12_traffic_additions.py rename to collectoss/application/schema/alembic/versions/12_traffic_additions.py index 2ef7b3d92..086075a9f 100644 --- a/augur/application/schema/alembic/versions/12_traffic_additions.py +++ b/collectoss/application/schema/alembic/versions/12_traffic_additions.py @@ -21,7 +21,7 @@ traffic_sequence = Sequence('repo_clones_data_id_seq', schema='augur_data') # Current Error - # File "/home/sean/github/berkeley/augur/application/schema/alembic/versions/12_traffic_additions.py", line 38, in add_repo_clone_data_table_1 + # File "/home/sean/github/berkeley/collectoss/application/schema/alembic/versions/12_traffic_additions.py", line 38, in add_repo_clone_data_table_1 # op.execute(schema.CreateSequence(traffic_sequence)) # NameError: name 'schema' is not defined diff --git a/augur/application/schema/alembic/versions/13_no_null_commit_contributors.py b/collectoss/application/schema/alembic/versions/13_no_null_commit_contributors.py similarity index 100% rename from augur/application/schema/alembic/versions/13_no_null_commit_contributors.py rename to 
collectoss/application/schema/alembic/versions/13_no_null_commit_contributors.py diff --git a/augur/application/schema/alembic/versions/14_commits_auto_vacuum.py b/collectoss/application/schema/alembic/versions/14_commits_auto_vacuum.py similarity index 100% rename from augur/application/schema/alembic/versions/14_commits_auto_vacuum.py rename to collectoss/application/schema/alembic/versions/14_commits_auto_vacuum.py diff --git a/augur/application/schema/alembic/versions/15_commit_performance_update.py b/collectoss/application/schema/alembic/versions/15_commit_performance_update.py similarity index 100% rename from augur/application/schema/alembic/versions/15_commit_performance_update.py rename to collectoss/application/schema/alembic/versions/15_commit_performance_update.py diff --git a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py b/collectoss/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py similarity index 100% rename from augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py rename to collectoss/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py diff --git a/augur/application/schema/alembic/versions/17_add_collection_status_constraints.py b/collectoss/application/schema/alembic/versions/17_add_collection_status_constraints.py similarity index 100% rename from augur/application/schema/alembic/versions/17_add_collection_status_constraints.py rename to collectoss/application/schema/alembic/versions/17_add_collection_status_constraints.py diff --git a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py b/collectoss/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py similarity index 94% rename from augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py rename to collectoss/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py index bbe22c724..aab90aae0 100644 --- a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py +++ b/collectoss/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py @@ -9,8 +9,8 @@ from sqlalchemy.sql import text import pathlib import shutil -from augur.application.config import AugurConfig -from augur.application.db.lib import get_value +from collectoss.application.config import SystemConfig +from collectoss.application.db.lib import get_value import logging # revision identifiers, used by Alembic. 
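A recurring change in the model files earlier in this diff is attaching an explicit Sequence object to primary-key columns that previously relied only on a server_default nextval(). A minimal sketch of the pattern follows; the table and sequence names are hypothetical placeholders, not from this codebase:

from sqlalchemy import BigInteger, Column, Sequence, text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Example(Base):
    __tablename__ = "example"
    __table_args__ = {"schema": "augur_data"}

    # The explicit Sequence lets SQLAlchemy emit CREATE SEQUENCE (with the
    # desired start value and schema) during metadata.create_all(), while the
    # server_default keeps INSERTs drawing ids from nextval() on the server.
    example_id = Column(
        BigInteger,
        Sequence("example_example_id_seq", start=25430, schema="augur_data"),
        primary_key=True,
        server_default=text("nextval('augur_data.example_example_id_seq'::regclass)"),
    )

Without the declared Sequence, create_all() would create the table but not the sequence its server_default refers to, which is presumably why the diff adds one next to every nextval() default.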
diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/collectoss/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py similarity index 92% rename from augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py rename to collectoss/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py index 29da454ed..05aeed8dd 100644 --- a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py +++ b/collectoss/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -6,8 +6,8 @@ """ from alembic import op -from augur.application.db.session import DatabaseSession -from augur.application.config import * +from collectoss.application.db.session import DatabaseSession +from collectoss.application.config import * from sqlalchemy.sql import text import logging @@ -22,7 +22,7 @@ def upgrade(): with DatabaseSession(logger) as session: - config = AugurConfig(logger,session) + config = SystemConfig(logger,session) config_dict = config.load_config() #Update the missing fields of the celery section in the config diff --git a/augur/application/schema/alembic/versions/1_augur_new_changes.py b/collectoss/application/schema/alembic/versions/1_augur_new_changes.py similarity index 99% rename from augur/application/schema/alembic/versions/1_augur_new_changes.py rename to collectoss/application/schema/alembic/versions/1_augur_new_changes.py index 2e8440294..c53d1507b 100644 --- a/augur/application/schema/alembic/versions/1_augur_new_changes.py +++ b/collectoss/application/schema/alembic/versions/1_augur_new_changes.py @@ -9,7 +9,7 @@ import sqlalchemy as sa from sqlalchemy.dialects import postgresql from sqlalchemy.sql import text -from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, UnresolvableUUID +from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, UnresolvableUUID # revision identifiers, used by Alembic. 
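The config migrations in this diff (19 above and 34 later) share one shape: open a DatabaseSession, wrap it in the renamed SystemConfig, and load the current config dict before patching in missing fields. A hedged sketch of that upgrade() skeleton; only DatabaseSession, SystemConfig, and load_config() are visible in the hunks, so the section key and write-back step here are stand-ins:

import logging

from collectoss.application.db.session import DatabaseSession
from collectoss.application.config import SystemConfig

logger = logging.getLogger(__name__)

def upgrade():
    with DatabaseSession(logger) as session:
        config = SystemConfig(logger, session)  # formerly AugurConfig
        config_dict = config.load_config()

        # The real migrations then fill in missing fields of a section
        # (e.g. celery) and persist them; the exact key casing and the
        # write-back helper are not shown in the hunks, so "Celery" and
        # setdefault() here are illustrative only.
        config_dict.setdefault("Celery", {})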
diff --git a/augur/application/schema/alembic/versions/20_add_dei_badging_table.py b/collectoss/application/schema/alembic/versions/20_add_dei_badging_table.py similarity index 100% rename from augur/application/schema/alembic/versions/20_add_dei_badging_table.py rename to collectoss/application/schema/alembic/versions/20_add_dei_badging_table.py diff --git a/augur/application/schema/alembic/versions/21_add_ml_tasks.py b/collectoss/application/schema/alembic/versions/21_add_ml_tasks.py similarity index 100% rename from augur/application/schema/alembic/versions/21_add_ml_tasks.py rename to collectoss/application/schema/alembic/versions/21_add_ml_tasks.py diff --git a/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py b/collectoss/application/schema/alembic/versions/22_mat_view_cntrbid.py similarity index 100% rename from augur/application/schema/alembic/versions/22_mat_view_cntrbid.py rename to collectoss/application/schema/alembic/versions/22_mat_view_cntrbid.py diff --git a/augur/application/schema/alembic/versions/23_add_index_ghlogin.py b/collectoss/application/schema/alembic/versions/23_add_index_ghlogin.py similarity index 100% rename from augur/application/schema/alembic/versions/23_add_index_ghlogin.py rename to collectoss/application/schema/alembic/versions/23_add_index_ghlogin.py diff --git a/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py b/collectoss/application/schema/alembic/versions/24_alter_repo_labor_unique.py similarity index 100% rename from augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py rename to collectoss/application/schema/alembic/versions/24_alter_repo_labor_unique.py diff --git a/augur/application/schema/alembic/versions/25_unique_on_mataview.py b/collectoss/application/schema/alembic/versions/25_unique_on_mataview.py similarity index 100% rename from augur/application/schema/alembic/versions/25_unique_on_mataview.py rename to collectoss/application/schema/alembic/versions/25_unique_on_mataview.py diff --git a/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py b/collectoss/application/schema/alembic/versions/26_materialized_view_unique_updates.py similarity index 100% rename from augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py rename to collectoss/application/schema/alembic/versions/26_materialized_view_unique_updates.py diff --git a/augur/application/schema/alembic/versions/27_update_messages_unique.py b/collectoss/application/schema/alembic/versions/27_update_messages_unique.py similarity index 100% rename from augur/application/schema/alembic/versions/27_update_messages_unique.py rename to collectoss/application/schema/alembic/versions/27_update_messages_unique.py diff --git a/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py b/collectoss/application/schema/alembic/versions/28_Performance_Indexes_a.py similarity index 100% rename from augur/application/schema/alembic/versions/28_Performance_Indexes_a.py rename to collectoss/application/schema/alembic/versions/28_Performance_Indexes_a.py diff --git a/augur/application/schema/alembic/versions/29_add_commit_message_table.py b/collectoss/application/schema/alembic/versions/29_add_commit_message_table.py similarity index 100% rename from augur/application/schema/alembic/versions/29_add_commit_message_table.py rename to collectoss/application/schema/alembic/versions/29_add_commit_message_table.py diff --git a/augur/application/schema/alembic/versions/2_augur_new_changes.py 
b/collectoss/application/schema/alembic/versions/2_augur_new_changes.py similarity index 99% rename from augur/application/schema/alembic/versions/2_augur_new_changes.py rename to collectoss/application/schema/alembic/versions/2_augur_new_changes.py index af6655f31..4705a2799 100644 --- a/augur/application/schema/alembic/versions/2_augur_new_changes.py +++ b/collectoss/application/schema/alembic/versions/2_augur_new_changes.py @@ -9,7 +9,7 @@ import sqlalchemy as sa from sqlalchemy.dialects import postgresql from sqlalchemy.sql import text -from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, UnresolvableUUID +from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, UnresolvableUUID # revision identifiers, used by Alembic. diff --git a/augur/application/schema/alembic/versions/30_add_repo_src_id.py b/collectoss/application/schema/alembic/versions/30_add_repo_src_id.py similarity index 100% rename from augur/application/schema/alembic/versions/30_add_repo_src_id.py rename to collectoss/application/schema/alembic/versions/30_add_repo_src_id.py diff --git a/augur/application/schema/alembic/versions/31_update_pr_events_unique.py b/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py similarity index 96% rename from augur/application/schema/alembic/versions/31_update_pr_events_unique.py rename to collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py index b55b60a09..f6aeeca20 100644 --- a/augur/application/schema/alembic/versions/31_update_pr_events_unique.py +++ b/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py @@ -8,7 +8,7 @@ from alembic import op import sqlalchemy as sa from sqlalchemy import text -from augur.application.db import create_database_engine, get_database_string +from collectoss.application.db import create_database_engine, get_database_string # revision identifiers, used by Alembic. diff --git a/augur/application/schema/alembic/versions/32_update_openssf_deps.py b/collectoss/application/schema/alembic/versions/32_update_openssf_deps.py similarity index 92% rename from augur/application/schema/alembic/versions/32_update_openssf_deps.py rename to collectoss/application/schema/alembic/versions/32_update_openssf_deps.py index 50343067f..ecc494326 100644 --- a/augur/application/schema/alembic/versions/32_update_openssf_deps.py +++ b/collectoss/application/schema/alembic/versions/32_update_openssf_deps.py @@ -8,7 +8,7 @@ from alembic import op import sqlalchemy as sa from sqlalchemy import text -from augur.application.db import create_database_engine, get_database_string +from collectoss.application.db import create_database_engine, get_database_string # revision identifiers, used by Alembic. 
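Migration 39, shown next, is built almost entirely from Alembic's comment operations. A standalone sketch of the two calls it uses; the table and column names are placeholders:

from alembic import op
import sqlalchemy as sa

def upgrade():
    # COMMENT ON TABLE: existing_comment records the current state so
    # downgrade() can restore it and autogenerate does not flag a
    # spurious difference.
    op.create_table_comment(
        "example_table",
        "New comment without the old project name.",
        existing_comment="Old comment mentioning the old project name.",
        schema="augur_data",
    )

    # COMMENT ON COLUMN via alter_column; the existing_* kwargs describe
    # the column as it is today so only the comment is changed.
    op.alter_column(
        "example_table",
        "example_column",
        existing_type=sa.VARCHAR(),
        comment="New column comment.",
        existing_comment="Old column comment.",
        existing_nullable=True,
        schema="augur_data",
    )

Note that the existing_comment strings in migration 39 must match the comments currently in the database verbatim, typos included ("issueing", "Present in another models"), which is why those added lines preserve the original wording rather than correcting it.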
diff --git a/augur/application/schema/alembic/versions/33_update_openssf_deps.py b/collectoss/application/schema/alembic/versions/33_update_openssf_deps.py similarity index 92% rename from augur/application/schema/alembic/versions/33_update_openssf_deps.py rename to collectoss/application/schema/alembic/versions/33_update_openssf_deps.py index c10d041a7..c54930839 100644 --- a/augur/application/schema/alembic/versions/33_update_openssf_deps.py +++ b/collectoss/application/schema/alembic/versions/33_update_openssf_deps.py @@ -8,7 +8,7 @@ from alembic import op import sqlalchemy as sa from sqlalchemy import text -from augur.application.db import create_database_engine, get_database_string +from collectoss.application.db import create_database_engine, get_database_string # revision identifiers, used by Alembic. diff --git a/augur/application/schema/alembic/versions/34_add_contrib_to_config.py b/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py similarity index 90% rename from augur/application/schema/alembic/versions/34_add_contrib_to_config.py rename to collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py index 1a87be365..f4e17a08b 100644 --- a/augur/application/schema/alembic/versions/34_add_contrib_to_config.py +++ b/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py @@ -6,8 +6,8 @@ """ from alembic import op -from augur.application.db.session import DatabaseSession -from augur.application.config import * +from collectoss.application.db.session import DatabaseSession +from collectoss.application.config import * from sqlalchemy.sql import text import logging @@ -22,7 +22,7 @@ def upgrade(): with DatabaseSession(logger) as session: - config = AugurConfig(logger,session) + config = SystemConfig(logger,session) config_dict = config.load_config() #Update the missing fields of the facade section in the config diff --git a/augur/application/schema/alembic/versions/35_create_topic_model_meta_table.py b/collectoss/application/schema/alembic/versions/35_create_topic_model_meta_table.py similarity index 100% rename from augur/application/schema/alembic/versions/35_create_topic_model_meta_table.py rename to collectoss/application/schema/alembic/versions/35_create_topic_model_meta_table.py diff --git a/augur/application/schema/alembic/versions/36_add_topic_model_event.py b/collectoss/application/schema/alembic/versions/36_add_topic_model_event.py similarity index 100% rename from augur/application/schema/alembic/versions/36_add_topic_model_event.py rename to collectoss/application/schema/alembic/versions/36_add_topic_model_event.py diff --git a/augur/application/schema/alembic/versions/37_sync_topic_model_migrations.py b/collectoss/application/schema/alembic/versions/37_sync_topic_model_migrations.py similarity index 100% rename from augur/application/schema/alembic/versions/37_sync_topic_model_migrations.py rename to collectoss/application/schema/alembic/versions/37_sync_topic_model_migrations.py diff --git a/augur/application/schema/alembic/versions/38_add_historical_repo_urls_table.py b/collectoss/application/schema/alembic/versions/38_add_historical_repo_urls_table.py similarity index 100% rename from augur/application/schema/alembic/versions/38_add_historical_repo_urls_table.py rename to collectoss/application/schema/alembic/versions/38_add_historical_repo_urls_table.py diff --git a/collectoss/application/schema/alembic/versions/39_rename.py b/collectoss/application/schema/alembic/versions/39_rename.py new file mode 100644 
index 000000000..12baa0028 --- /dev/null +++ b/collectoss/application/schema/alembic/versions/39_rename.py @@ -0,0 +1,256 @@ +"""rename + +Revision ID: 39 +Revises: 38 +Create Date: 2026-04-18 20:46:55.800268 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '39' +down_revision = '38' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table_comment( + 'chaoss_metric_status', + 'This table used to track CHAOSS Metric implementations, but due to the constantly changing location of that information, it is for the moment not actively populated. ', + existing_comment='This table used to track CHAOSS Metric implementations in Augur, but due to the constantly changing location of that information, it is for the moment not actively populated. ', + schema='augur_data' + ) + op.create_table_comment( + 'contributor_affiliations', + 'This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and collectoss augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when CollectOSS is finished counting commits using the Facade Worker. ', + existing_comment='This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and augur augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when Augur is finished counting commits using the Facade Worker. ', + schema='augur_data' + ) + op.create_table_comment( + 'contributor_repo', + 'Developed in Partnership with Andrew Brain.', + existing_comment='Developed in Partnership with Andrew Brain. \nFrom: [\n {\n "login": "octocat",\n "id": 1,\n "node_id": "MDQ6VXNlcjE=",\n "avatar_url": "https://github.com/images/error/octocat_happy.gif",\n "gravatar_id": "",\n "url": "https://api.github.com/users/octocat",\n "html_url": "https://github.com/octocat",\n "followers_url": "https://api.github.com/users/octocat/followers",\n "following_url": "https://api.github.com/users/octocat/following{/other_user}",\n "gists_url": "https://api.github.com/users/octocat/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/octocat/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/octocat/subscriptions",\n "organizations_url": "https://api.github.com/users/octocat/orgs",\n "repos_url": "https://api.github.com/users/octocat/repos",\n "events_url": "https://api.github.com/users/octocat/events{/privacy}",\n "received_events_url": "https://api.github.com/users/octocat/received_events",\n "type": "User",\n "site_admin": false\n }\n]\n', + schema='augur_data' + ) + op.alter_column('contributors', 'cntrb_email', + existing_type=sa.VARCHAR(), + comment='This needs to be here for matching contributor ids to the commit information. ', + existing_comment='This needs to be here for matching contributor ids, which are augur, to the commit information. ', + existing_nullable=True, + schema='augur_data') + op.alter_column('contributors', 'cntrb_type', + existing_type=sa.VARCHAR(), + comment='Present in another models. It is not currently used. ', + existing_comment='Present in another models. It is not currently used in Augur. 
', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'contributors_aliases', + 'Every open source user may have more than one email used to make contributions over time. CollectOSS selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ', + existing_comment='Every open source user may have more than one email used to make contributions over time. Augur selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ', + schema='augur_data' + ) + op.alter_column('pull_request_meta', 'pr_src_meta_label', + existing_type=sa.VARCHAR(), + comment='This is a representation of the repo:branch information in the pull request. Head is issueing the pull request and base is taking the pull request.', + existing_comment='This is a representation of the repo:branch information in the pull request. Head is issueing the pull request and base is taking the pull request. For example: (We do not store all of this)\n\n "head": {\n "label": "chaoss:pull-request-worker",\n "ref": "pull-request-worker",\n "sha": "6b380c3d6d625616f79d702612ebab6d204614f2",\n "user": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "repo": {\n "id": 78134122,\n "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==",\n "name": "augur",\n "full_name": "chaoss/augur",\n "private": false,\n "owner": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": 
"https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "html_url": "https://github.com/chaoss/augur",\n "description": "Python library and web service for Open Source Software Health and Sustainability metrics & data collection.",\n "fork": false,\n "url": "https://api.github.com/repos/chaoss/augur",\n "forks_url": "https://api.github.com/repos/chaoss/augur/forks",\n "keys_url": "https://api.github.com/repos/chaoss/augur/keys{/key_id}",\n "collaborators_url": "https://api.github.com/repos/chaoss/augur/collaborators{/collaborator}",\n "teams_url": "https://api.github.com/repos/chaoss/augur/teams",\n "hooks_url": "https://api.github.com/repos/chaoss/augur/hooks",\n "issue_events_url": "https://api.github.com/repos/chaoss/augur/issues/events{/number}",\n "events_url": "https://api.github.com/repos/chaoss/augur/events",\n "assignees_url": "https://api.github.com/repos/chaoss/augur/assignees{/user}",\n "branches_url": "https://api.github.com/repos/chaoss/augur/branches{/branch}",\n "tags_url": "https://api.github.com/repos/chaoss/augur/tags",\n "blobs_url": "https://api.github.com/repos/chaoss/augur/git/blobs{/sha}",\n "git_tags_url": "https://api.github.com/repos/chaoss/augur/git/tags{/sha}",\n "git_refs_url": "https://api.github.com/repos/chaoss/augur/git/refs{/sha}",\n "trees_url": "https://api.github.com/repos/chaoss/augur/git/trees{/sha}",\n "statuses_url": "https://api.github.com/repos/chaoss/augur/statuses/{sha}",\n "languages_url": "https://api.github.com/repos/chaoss/augur/languages",\n "stargazers_url": "https://api.github.com/repos/chaoss/augur/stargazers",\n "contributors_url": "https://api.github.com/repos/chaoss/augur/contributors",\n "subscribers_url": "https://api.github.com/repos/chaoss/augur/subscribers",\n "subscription_url": "https://api.github.com/repos/chaoss/augur/subscription",\n "commits_url": "https://api.github.com/repos/chaoss/augur/commits{/sha}",\n "git_commits_url": "https://api.github.com/repos/chaoss/augur/git/commits{/sha}",\n "comments_url": "https://api.github.com/repos/chaoss/augur/comments{/number}",\n "issue_comment_url": "https://api.github.com/repos/chaoss/augur/issues/comments{/number}",\n "contents_url": "https://api.github.com/repos/chaoss/augur/contents/{+path}",\n "compare_url": "https://api.github.com/repos/chaoss/augur/compare/{base}...{head}",\n "merges_url": "https://api.github.com/repos/chaoss/augur/merges",\n "archive_url": "https://api.github.com/repos/chaoss/augur/{archive_format}{/ref}",\n "downloads_url": "https://api.github.com/repos/chaoss/augur/downloads",\n "issues_url": "https://api.github.com/repos/chaoss/augur/issues{/number}",\n "pulls_url": "https://api.github.com/repos/chaoss/augur/pulls{/number}",\n "milestones_url": "https://api.github.com/repos/chaoss/augur/milestones{/number}",\n "notifications_url": "https://api.github.com/repos/chaoss/augur/notifications{?since,all,participating}",\n "labels_url": "https://api.github.com/repos/chaoss/augur/labels{/name}",\n "releases_url": "https://api.github.com/repos/chaoss/augur/releases{/id}",\n "deployments_url": "https://api.github.com/repos/chaoss/augur/deployments",\n 
"created_at": "2017-01-05T17:34:54Z",\n "updated_at": "2019-11-15T00:56:12Z",\n "pushed_at": "2019-12-02T06:27:26Z",\n "git_url": "git://github.com/chaoss/augur.git",\n "ssh_url": "git@github.com:chaoss/augur.git",\n "clone_url": "https://github.com/chaoss/augur.git",\n "svn_url": "https://github.com/chaoss/augur",\n "homepage": "http://augur.osshealth.io/",\n "size": 82004,\n "stargazers_count": 153,\n "watchers_count": 153,\n "language": "Python",\n "has_issues": true,\n "has_projects": false,\n "has_downloads": true,\n "has_wiki": false,\n "has_pages": true,\n "forks_count": 205,\n "mirror_url": null,\n "archived": false,\n "disabled": false,\n "open_issues_count": 14,\n "license": {\n "key": "mit",\n "name": "MIT License",\n "spdx_id": "MIT",\n "url": "https://api.github.com/licenses/mit",\n "node_id": "MDc6TGljZW5zZTEz"\n },\n "forks": 205,\n "open_issues": 14,\n "watchers": 153,\n "default_branch": "master"\n }\n },\n "base": {\n "label": "chaoss:dev",\n "ref": "dev",\n "sha": "bfd2d34b51659613dd842cf83c3873f7699c2a0e",\n "user": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "repo": {\n "id": 78134122,\n "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==",\n "name": "augur",\n "full_name": "chaoss/augur",\n "private": false,\n "owner": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'pull_request_meta', + 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. 
Similar functions exist in GitLab, though the language here is based on GitHub.', + existing_comment='Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. Similar functions exist in GitLab, though the language here is based on GitHub. The JSON Being adapted to as of the development of this schema is here: "base": { "label": "chaoss:dev", "ref": "dev", "sha": "dc6c6f3947f7dc84ecba3d8bda641ef786e7027d", "user": { "login": "chaoss", "id": 29740296, "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2", "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/chaoss", "html_url": "https://github.com/chaoss", "followers_url": "https://api.github.com/users/chaoss/followers", "following_url": "https://api.github.com/users/chaoss/following{/other_user}", "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}", "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions", "organizations_url": "https://api.github.com/users/chaoss/orgs", "repos_url": "https://api.github.com/users/chaoss/repos", "events_url": "https://api.github.com/users/chaoss/events{/privacy}", "received_events_url": "https://api.github.com/users/chaoss/received_events", "type": "Organization", "site_admin": false }, "repo": { "id": 78134122, "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==", "name": "augur", "full_name": "chaoss/augur", "private": false, "owner": { "login": "chaoss", "id": 29740296, "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2", "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/chaoss", "html_url": "https://github.com/chaoss", "followers_url": "https://api.github.com/users/chaoss/followers", "following_url": "https://api.github.com/users/chaoss/following{/other_user}", "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}", "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions", "organizations_url": "https://api.github.com/users/chaoss/orgs", "repos_url": "https://api.github.com/users/chaoss/repos", "events_url": "https://api.github.com/users/chaoss/events{/privacy}", "received_events_url": "https://api.github.com/users/chaoss/received_events", "type": "Organization", "site_admin": false }, ', + schema='augur_data' + ) + op.alter_column('pull_requests', 'pr_augur_issue_id', + existing_type=sa.BIGINT(), + comment='This is to link to the internal ID for the related issue', + existing_comment='This is to link to the augur stored related issue', + existing_nullable=True, + schema='augur_data') + op.alter_column('pull_requests', 'pr_augur_contributor_id', + existing_type=sa.UUID(), + comment='This is to link to the contributor record. ', + existing_comment='This is to link to the augur contributor record. ', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'repo_group_insights', + 'This table is output from an analytical worker. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. 
\n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + existing_comment='This table is output from an analytical worker inside of Augur. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + schema='augur_data' + ) + op.create_table_comment( + 'repo_insights', + 'This table is output from an analytical worker. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + existing_comment='This table is output from an analytical worker inside of Augur. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + schema='augur_data' + ) + op.alter_column('repo_insights_records', 'tool_source', + existing_type=sa.VARCHAR(), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('repo_insights_records', 'tool_version', + existing_type=sa.VARCHAR(), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('repo_insights_records', 'data_source', + existing_type=sa.VARCHAR(), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('repo_insights_records', 'data_collection_date', + existing_type=postgresql.TIMESTAMP(precision=6), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + existing_server_default=sa.text('CURRENT_TIMESTAMP'), + schema='augur_data') + op.alter_column('topic_model_meta', 'tool_source', + existing_type=sa.VARCHAR(), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('topic_model_meta', 'tool_version', + existing_type=sa.VARCHAR(), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('topic_model_meta', 'data_source', + existing_type=sa.VARCHAR(), + comment='Standard Collection Metadata', + existing_comment='Standard Augur Metadata', + existing_nullable=True, + schema='augur_data') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column('topic_model_meta', 'data_source', + existing_type=sa.VARCHAR(), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('topic_model_meta', 'tool_version', + existing_type=sa.VARCHAR(), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('topic_model_meta', 'tool_source', + existing_type=sa.VARCHAR(), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('repo_insights_records', 'data_collection_date', + existing_type=postgresql.TIMESTAMP(precision=6), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + existing_server_default=sa.text('CURRENT_TIMESTAMP'), + schema='augur_data') + op.alter_column('repo_insights_records', 'data_source', + existing_type=sa.VARCHAR(), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('repo_insights_records', 'tool_version', + existing_type=sa.VARCHAR(), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + schema='augur_data') + op.alter_column('repo_insights_records', 'tool_source', + existing_type=sa.VARCHAR(), + comment='Standard Augur Metadata', + existing_comment='Standard Collection Metadata', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'repo_insights', + 'This table is output from an analytical worker inside of Augur. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + existing_comment='This table is output from an analytical worker. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + schema='augur_data' + ) + op.create_table_comment( + 'repo_group_insights', + 'This table is output from an analytical worker inside of Augur. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', + existing_comment='This table is output from an analytical worker. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. 
', + schema='augur_data' + ) + op.alter_column('pull_requests', 'pr_augur_contributor_id', + existing_type=sa.UUID(), + comment='This is to link to the augur contributor record. ', + existing_comment='This is to link to the contributor record. ', + existing_nullable=True, + schema='augur_data') + op.alter_column('pull_requests', 'pr_augur_issue_id', + existing_type=sa.BIGINT(), + comment='This is to link to the augur stored related issue', + existing_comment='This is to link to the internal ID for the related issue', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'pull_request_meta', + 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. Similar functions exist in GitLab, though the language here is based on GitHub. The JSON Being adapted to as of the development of this schema is here: "base": { "label": "chaoss:dev", "ref": "dev", "sha": "dc6c6f3947f7dc84ecba3d8bda641ef786e7027d", "user": { "login": "chaoss", "id": 29740296, "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2", "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/chaoss", "html_url": "https://github.com/chaoss", "followers_url": "https://api.github.com/users/chaoss/followers", "following_url": "https://api.github.com/users/chaoss/following{/other_user}", "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}", "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions", "organizations_url": "https://api.github.com/users/chaoss/orgs", "repos_url": "https://api.github.com/users/chaoss/repos", "events_url": "https://api.github.com/users/chaoss/events{/privacy}", "received_events_url": "https://api.github.com/users/chaoss/received_events", "type": "Organization", "site_admin": false }, "repo": { "id": 78134122, "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==", "name": "augur", "full_name": "chaoss/augur", "private": false, "owner": { "login": "chaoss", "id": 29740296, "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2", "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4", "gravatar_id": "", "url": "https://api.github.com/users/chaoss", "html_url": "https://github.com/chaoss", "followers_url": "https://api.github.com/users/chaoss/followers", "following_url": "https://api.github.com/users/chaoss/following{/other_user}", "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}", "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions", "organizations_url": "https://api.github.com/users/chaoss/orgs", "repos_url": "https://api.github.com/users/chaoss/repos", "events_url": "https://api.github.com/users/chaoss/events{/privacy}", "received_events_url": "https://api.github.com/users/chaoss/received_events", "type": "Organization", "site_admin": false }, ', + existing_comment='Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. 
Similar functions exist in GitLab, though the language here is based on GitHub.', + schema='augur_data' + ) + op.alter_column('pull_request_meta', 'pr_src_meta_label', + existing_type=sa.VARCHAR(), + comment='This is a representation of the repo:branch information in the pull request. Head is issueing the pull request and base is taking the pull request. For example: (We do not store all of this)\n\n "head": {\n "label": "chaoss:pull-request-worker",\n "ref": "pull-request-worker",\n "sha": "6b380c3d6d625616f79d702612ebab6d204614f2",\n "user": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "repo": {\n "id": 78134122,\n "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==",\n "name": "augur",\n "full_name": "chaoss/augur",\n "private": false,\n "owner": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "html_url": "https://github.com/chaoss/augur",\n "description": "Python library and web service for Open Source Software Health and Sustainability metrics & data collection.",\n "fork": false,\n "url": "https://api.github.com/repos/chaoss/augur",\n "forks_url": "https://api.github.com/repos/chaoss/augur/forks",\n "keys_url": "https://api.github.com/repos/chaoss/augur/keys{/key_id}",\n "collaborators_url": "https://api.github.com/repos/chaoss/augur/collaborators{/collaborator}",\n "teams_url": "https://api.github.com/repos/chaoss/augur/teams",\n "hooks_url": "https://api.github.com/repos/chaoss/augur/hooks",\n "issue_events_url": "https://api.github.com/repos/chaoss/augur/issues/events{/number}",\n "events_url": "https://api.github.com/repos/chaoss/augur/events",\n "assignees_url": "https://api.github.com/repos/chaoss/augur/assignees{/user}",\n "branches_url": 
"https://api.github.com/repos/chaoss/augur/branches{/branch}",\n "tags_url": "https://api.github.com/repos/chaoss/augur/tags",\n "blobs_url": "https://api.github.com/repos/chaoss/augur/git/blobs{/sha}",\n "git_tags_url": "https://api.github.com/repos/chaoss/augur/git/tags{/sha}",\n "git_refs_url": "https://api.github.com/repos/chaoss/augur/git/refs{/sha}",\n "trees_url": "https://api.github.com/repos/chaoss/augur/git/trees{/sha}",\n "statuses_url": "https://api.github.com/repos/chaoss/augur/statuses/{sha}",\n "languages_url": "https://api.github.com/repos/chaoss/augur/languages",\n "stargazers_url": "https://api.github.com/repos/chaoss/augur/stargazers",\n "contributors_url": "https://api.github.com/repos/chaoss/augur/contributors",\n "subscribers_url": "https://api.github.com/repos/chaoss/augur/subscribers",\n "subscription_url": "https://api.github.com/repos/chaoss/augur/subscription",\n "commits_url": "https://api.github.com/repos/chaoss/augur/commits{/sha}",\n "git_commits_url": "https://api.github.com/repos/chaoss/augur/git/commits{/sha}",\n "comments_url": "https://api.github.com/repos/chaoss/augur/comments{/number}",\n "issue_comment_url": "https://api.github.com/repos/chaoss/augur/issues/comments{/number}",\n "contents_url": "https://api.github.com/repos/chaoss/augur/contents/{+path}",\n "compare_url": "https://api.github.com/repos/chaoss/augur/compare/{base}...{head}",\n "merges_url": "https://api.github.com/repos/chaoss/augur/merges",\n "archive_url": "https://api.github.com/repos/chaoss/augur/{archive_format}{/ref}",\n "downloads_url": "https://api.github.com/repos/chaoss/augur/downloads",\n "issues_url": "https://api.github.com/repos/chaoss/augur/issues{/number}",\n "pulls_url": "https://api.github.com/repos/chaoss/augur/pulls{/number}",\n "milestones_url": "https://api.github.com/repos/chaoss/augur/milestones{/number}",\n "notifications_url": "https://api.github.com/repos/chaoss/augur/notifications{?since,all,participating}",\n "labels_url": "https://api.github.com/repos/chaoss/augur/labels{/name}",\n "releases_url": "https://api.github.com/repos/chaoss/augur/releases{/id}",\n "deployments_url": "https://api.github.com/repos/chaoss/augur/deployments",\n "created_at": "2017-01-05T17:34:54Z",\n "updated_at": "2019-11-15T00:56:12Z",\n "pushed_at": "2019-12-02T06:27:26Z",\n "git_url": "git://github.com/chaoss/augur.git",\n "ssh_url": "git@github.com:chaoss/augur.git",\n "clone_url": "https://github.com/chaoss/augur.git",\n "svn_url": "https://github.com/chaoss/augur",\n "homepage": "http://augur.osshealth.io/",\n "size": 82004,\n "stargazers_count": 153,\n "watchers_count": 153,\n "language": "Python",\n "has_issues": true,\n "has_projects": false,\n "has_downloads": true,\n "has_wiki": false,\n "has_pages": true,\n "forks_count": 205,\n "mirror_url": null,\n "archived": false,\n "disabled": false,\n "open_issues_count": 14,\n "license": {\n "key": "mit",\n "name": "MIT License",\n "spdx_id": "MIT",\n "url": "https://api.github.com/licenses/mit",\n "node_id": "MDc6TGljZW5zZTEz"\n },\n "forks": 205,\n "open_issues": 14,\n "watchers": 153,\n "default_branch": "master"\n }\n },\n "base": {\n "label": "chaoss:dev",\n "ref": "dev",\n "sha": "bfd2d34b51659613dd842cf83c3873f7699c2a0e",\n "user": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": 
"https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n "repo": {\n "id": 78134122,\n "node_id": "MDEwOlJlcG9zaXRvcnk3ODEzNDEyMg==",\n "name": "augur",\n "full_name": "chaoss/augur",\n "private": false,\n "owner": {\n "login": "chaoss",\n "id": 29740296,\n "node_id": "MDEyOk9yZ2FuaXphdGlvbjI5NzQwMjk2",\n "avatar_url": "https://avatars2.githubusercontent.com/u/29740296?v=4",\n "gravatar_id": "",\n "url": "https://api.github.com/users/chaoss",\n "html_url": "https://github.com/chaoss",\n "followers_url": "https://api.github.com/users/chaoss/followers",\n "following_url": "https://api.github.com/users/chaoss/following{/other_user}",\n "gists_url": "https://api.github.com/users/chaoss/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/chaoss/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/chaoss/subscriptions",\n "organizations_url": "https://api.github.com/users/chaoss/orgs",\n "repos_url": "https://api.github.com/users/chaoss/repos",\n "events_url": "https://api.github.com/users/chaoss/events{/privacy}",\n "received_events_url": "https://api.github.com/users/chaoss/received_events",\n "type": "Organization",\n "site_admin": false\n },\n', + existing_comment='This is a representation of the repo:branch information in the pull request. Head is issueing the pull request and base is taking the pull request.', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'contributors_aliases', + 'Every open source user may have more than one email used to make contributions over time. Augur selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ', + existing_comment='Every open source user may have more than one email used to make contributions over time. CollectOSS selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ', + schema='augur_data' + ) + op.alter_column('contributors', 'cntrb_type', + existing_type=sa.VARCHAR(), + comment='Present in another models. It is not currently used in Augur. ', + existing_comment='Present in another models. It is not currently used. 
', + existing_nullable=True, + schema='augur_data') + op.alter_column('contributors', 'cntrb_email', + existing_type=sa.VARCHAR(), + comment='This needs to be here for matching contributor ids, which are augur, to the commit information. ', + existing_comment='This needs to be here for matching contributor ids to the commit information. ', + existing_nullable=True, + schema='augur_data') + op.create_table_comment( + 'contributor_repo', + 'Developed in Partnership with Andrew Brain. \nFrom: [\n {\n "login": "octocat",\n "id": 1,\n "node_id": "MDQ6VXNlcjE=",\n "avatar_url": "https://github.com/images/error/octocat_happy.gif",\n "gravatar_id": "",\n "url": "https://api.github.com/users/octocat",\n "html_url": "https://github.com/octocat",\n "followers_url": "https://api.github.com/users/octocat/followers",\n "following_url": "https://api.github.com/users/octocat/following{/other_user}",\n "gists_url": "https://api.github.com/users/octocat/gists{/gist_id}",\n "starred_url": "https://api.github.com/users/octocat/starred{/owner}{/repo}",\n "subscriptions_url": "https://api.github.com/users/octocat/subscriptions",\n "organizations_url": "https://api.github.com/users/octocat/orgs",\n "repos_url": "https://api.github.com/users/octocat/repos",\n "events_url": "https://api.github.com/users/octocat/events{/privacy}",\n "received_events_url": "https://api.github.com/users/octocat/received_events",\n "type": "User",\n "site_admin": false\n }\n]\n', + existing_comment='Developed in Partnership with Andrew Brain.', + schema='augur_data' + ) + op.create_table_comment( + 'contributor_affiliations', + 'This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and collectoss augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when Augur is finished counting commits using the Facade Worker. ', + existing_comment='This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and augur augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when CollectOSS is finished counting commits using the Facade Worker. ', + schema='augur_data' + ) + op.create_table_comment( + 'chaoss_metric_status', + 'This table used to track CHAOSS Metric implementations in Augur, but due to the constantly changing location of that information, it is for the moment not actively populated. ', + existing_comment='This table used to track CHAOSS Metric implementations, but due to the constantly changing location of that information, it is for the moment not actively populated. 
', schema='augur_data' ) # ### end Alembic commands ### diff --git a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py b/collectoss/application/schema/alembic/versions/3_oauth_and_user_groups.py similarity index 99% rename from augur/application/schema/alembic/versions/3_oauth_and_user_groups.py rename to collectoss/application/schema/alembic/versions/3_oauth_and_user_groups.py index 9a1951251..6fc4b4be2 100644 --- a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py +++ b/collectoss/application/schema/alembic/versions/3_oauth_and_user_groups.py @@ -9,7 +9,7 @@ from alembic import op import sqlalchemy as sa -from augur.application.db.session import DatabaseSession +from collectoss.application.db.session import DatabaseSession CLI_USER_ID = 1 diff --git a/collectoss/application/schema/alembic/versions/40_selectively_drop_test_table.py b/collectoss/application/schema/alembic/versions/40_selectively_drop_test_table.py new file mode 100644 index 000000000..1eb3b3d9f --- /dev/null +++ b/collectoss/application/schema/alembic/versions/40_selectively_drop_test_table.py @@ -0,0 +1,34 @@ +"""selectively drop test table + +Revision ID: 40 +Revises: 39 +Create Date: 2026-02-17 16:01:29.379433 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.engine.reflection import Inspector + + +# revision identifiers, used by Alembic. +revision = '40' +down_revision = '39' +branch_labels = None +depends_on = None + + +def upgrade(): + conn = op.get_bind() + inspector = Inspector.from_engine(conn) + tables = inspector.get_table_names() + # the db init script in the container includes this table in the public schema for some reason + # databases created after that PR was merged will have this extraneous table, so we should drop it. + if "test" in tables: + op.drop_table('test') + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('test', + sa.Column('test', sa.VARCHAR(length=255), autoincrement=False, nullable=True) + ) + # ### end Alembic commands ### diff --git a/collectoss/application/schema/alembic/versions/41_fix_alias_email_constraints.py b/collectoss/application/schema/alembic/versions/41_fix_alias_email_constraints.py new file mode 100644 index 000000000..33514cab0 --- /dev/null +++ b/collectoss/application/schema/alembic/versions/41_fix_alias_email_constraints.py @@ -0,0 +1,30 @@ +"""fix alias email constraints + +Revision ID: 41 +Revises: 40 +Create Date: 2026-05-04 14:23:57.315794 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '41' +down_revision = '40' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(op.f('contributor-alias-unique'), 'contributors_aliases', schema='augur_data', type_='unique') + op.create_unique_constraint('cntrb-email-insert-unique', 'contributors_aliases', ['cntrb_id', 'alias_email'], schema='augur_data') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust!
### + op.drop_constraint('cntrb-email-insert-unique', 'contributors_aliases', schema='augur_data', type_='unique') + op.create_unique_constraint(op.f('contributor-alias-unique'), 'contributors_aliases', ['alias_email'], schema='augur_data') + # ### end Alembic commands ### diff --git a/augur/application/schema/alembic/versions/4_explorer_materialized_view_update.py b/collectoss/application/schema/alembic/versions/4_explorer_materialized_view_update.py similarity index 99% rename from augur/application/schema/alembic/versions/4_explorer_materialized_view_update.py rename to collectoss/application/schema/alembic/versions/4_explorer_materialized_view_update.py index bbe8a1d6a..e30c2f312 100644 --- a/augur/application/schema/alembic/versions/4_explorer_materialized_view_update.py +++ b/collectoss/application/schema/alembic/versions/4_explorer_materialized_view_update.py @@ -9,7 +9,7 @@ import sqlalchemy as sa from sqlalchemy.dialects import postgresql from sqlalchemy.sql import text -from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, UnresolvableUUID +from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, UnresolvableUUID # revision identifiers, used by Alembic. diff --git a/augur/application/schema/alembic/versions/5_add_collection_status_table.py b/collectoss/application/schema/alembic/versions/5_add_collection_status_table.py similarity index 100% rename from augur/application/schema/alembic/versions/5_add_collection_status_table.py rename to collectoss/application/schema/alembic/versions/5_add_collection_status_table.py diff --git a/augur/application/schema/alembic/versions/6_change_collectionstatus_table_to_keep_.py b/collectoss/application/schema/alembic/versions/6_change_collectionstatus_table_to_keep_.py similarity index 100% rename from augur/application/schema/alembic/versions/6_change_collectionstatus_table_to_keep_.py rename to collectoss/application/schema/alembic/versions/6_change_collectionstatus_table_to_keep_.py diff --git a/augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py b/collectoss/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py similarity index 95% rename from augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py rename to collectoss/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py index e2711ca19..a9b130061 100644 --- a/augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py +++ b/collectoss/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py @@ -7,7 +7,7 @@ """ from alembic import op from sqlalchemy.sql import text -from augur.application.db.models import Repo +from collectoss.application.db.models import Repo # revision identifiers, used by Alembic. 
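The rename migration (`39_rename.py`) above follows a strict mirror pattern: each `op.create_table_comment` and `op.alter_column` call carries both the new `comment` and the `existing_comment` it replaces, and `downgrade()` repeats every call with the two values swapped. A condensed sketch of that pattern, using a placeholder table and placeholder comment strings rather than anything from the migration itself:

```python
from alembic import op


def upgrade():
    # The new text goes in `comment`; `existing_comment` records what is being
    # replaced so the operation stays reversible.
    op.create_table_comment(
        'example_table',  # placeholder, not a real CollectOSS table
        'Comment with the project-specific wording removed.',
        existing_comment='Comment mentioning the old project name.',
        schema='augur_data'
    )


def downgrade():
    # Exact mirror of upgrade(): comment and existing_comment trade places.
    op.create_table_comment(
        'example_table',
        'Comment mentioning the old project name.',
        existing_comment='Comment with the project-specific wording removed.',
        schema='augur_data'
    )
```

Keeping `existing_comment` accurate is what lets Alembic reverse a comment-only change without inspecting the live database.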
diff --git a/augur/application/schema/alembic/versions/8_add_unique_to_pr_reviews.py b/collectoss/application/schema/alembic/versions/8_add_unique_to_pr_reviews.py similarity index 100% rename from augur/application/schema/alembic/versions/8_add_unique_to_pr_reviews.py rename to collectoss/application/schema/alembic/versions/8_add_unique_to_pr_reviews.py diff --git a/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py b/collectoss/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py similarity index 100% rename from augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py rename to collectoss/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py diff --git a/augur/application/schema/alembic/versions/__init__.py b/collectoss/application/schema/alembic/versions/__init__.py similarity index 100% rename from augur/application/schema/alembic/versions/__init__.py rename to collectoss/application/schema/alembic/versions/__init__.py diff --git a/augur/application/schema/alembic/versions/legacy/100.sql b/collectoss/application/schema/alembic/versions/legacy/100.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/100.sql rename to collectoss/application/schema/alembic/versions/legacy/100.sql diff --git a/augur/application/schema/alembic/versions/legacy/80.1-create-schema.sql b/collectoss/application/schema/alembic/versions/legacy/80.1-create-schema.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/80.1-create-schema.sql rename to collectoss/application/schema/alembic/versions/legacy/80.1-create-schema.sql diff --git a/augur/application/schema/alembic/versions/legacy/80.2-create-tables.sql b/collectoss/application/schema/alembic/versions/legacy/80.2-create-tables.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/80.2-create-tables.sql rename to collectoss/application/schema/alembic/versions/legacy/80.2-create-tables.sql diff --git a/augur/application/schema/alembic/versions/legacy/80.3-sample-data.sql b/collectoss/application/schema/alembic/versions/legacy/80.3-sample-data.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/80.3-sample-data.sql rename to collectoss/application/schema/alembic/versions/legacy/80.3-sample-data.sql diff --git a/augur/application/schema/alembic/versions/legacy/83.sql b/collectoss/application/schema/alembic/versions/legacy/83.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/83.sql rename to collectoss/application/schema/alembic/versions/legacy/83.sql diff --git a/augur/application/schema/alembic/versions/legacy/84.sql b/collectoss/application/schema/alembic/versions/legacy/84.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/84.sql rename to collectoss/application/schema/alembic/versions/legacy/84.sql diff --git a/augur/application/schema/alembic/versions/legacy/85.sql b/collectoss/application/schema/alembic/versions/legacy/85.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/85.sql rename to collectoss/application/schema/alembic/versions/legacy/85.sql diff --git a/augur/application/schema/alembic/versions/legacy/86.sql b/collectoss/application/schema/alembic/versions/legacy/86.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/86.sql rename to 
collectoss/application/schema/alembic/versions/legacy/86.sql diff --git a/augur/application/schema/alembic/versions/legacy/87.sql b/collectoss/application/schema/alembic/versions/legacy/87.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/87.sql rename to collectoss/application/schema/alembic/versions/legacy/87.sql diff --git a/augur/application/schema/alembic/versions/legacy/88.sql b/collectoss/application/schema/alembic/versions/legacy/88.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/88.sql rename to collectoss/application/schema/alembic/versions/legacy/88.sql diff --git a/augur/application/schema/alembic/versions/legacy/89.sql b/collectoss/application/schema/alembic/versions/legacy/89.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/89.sql rename to collectoss/application/schema/alembic/versions/legacy/89.sql diff --git a/augur/application/schema/alembic/versions/legacy/90.sql b/collectoss/application/schema/alembic/versions/legacy/90.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/90.sql rename to collectoss/application/schema/alembic/versions/legacy/90.sql diff --git a/augur/application/schema/alembic/versions/legacy/91.sql b/collectoss/application/schema/alembic/versions/legacy/91.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/91.sql rename to collectoss/application/schema/alembic/versions/legacy/91.sql diff --git a/augur/application/schema/alembic/versions/legacy/92.sql b/collectoss/application/schema/alembic/versions/legacy/92.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/92.sql rename to collectoss/application/schema/alembic/versions/legacy/92.sql diff --git a/augur/application/schema/alembic/versions/legacy/94.sql b/collectoss/application/schema/alembic/versions/legacy/94.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/94.sql rename to collectoss/application/schema/alembic/versions/legacy/94.sql diff --git a/augur/application/schema/alembic/versions/legacy/95.sql b/collectoss/application/schema/alembic/versions/legacy/95.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/95.sql rename to collectoss/application/schema/alembic/versions/legacy/95.sql diff --git a/augur/application/schema/alembic/versions/legacy/96.sql b/collectoss/application/schema/alembic/versions/legacy/96.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/96.sql rename to collectoss/application/schema/alembic/versions/legacy/96.sql diff --git a/augur/application/schema/alembic/versions/legacy/97.sql b/collectoss/application/schema/alembic/versions/legacy/97.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/97.sql rename to collectoss/application/schema/alembic/versions/legacy/97.sql diff --git a/augur/application/schema/alembic/versions/legacy/98.sql b/collectoss/application/schema/alembic/versions/legacy/98.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/98.sql rename to collectoss/application/schema/alembic/versions/legacy/98.sql diff --git a/augur/application/schema/alembic/versions/legacy/99.sql b/collectoss/application/schema/alembic/versions/legacy/99.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/99.sql rename to 
collectoss/application/schema/alembic/versions/legacy/99.sql diff --git a/augur/application/schema/alembic/versions/legacy/commit.sql b/collectoss/application/schema/alembic/versions/legacy/commit.sql similarity index 100% rename from augur/application/schema/alembic/versions/legacy/commit.sql rename to collectoss/application/schema/alembic/versions/legacy/commit.sql diff --git a/augur/application/schema/augur_full.sql b/collectoss/application/schema/augur_full.sql similarity index 99% rename from augur/application/schema/augur_full.sql rename to collectoss/application/schema/augur_full.sql index 6eef895cb..203707157 100644 --- a/augur/application/schema/augur_full.sql +++ b/collectoss/application/schema/augur_full.sql @@ -4691,7 +4691,7 @@ COPY augur_data.repo (repo_id, repo_group_id, repo_git, repo_path, repo_name, re 24441 10 https://github.com/operate-first/operate-first-twitter \N \N 2021-08-25 16:47:47 New \N \N \N \N \N Parent not available \N \N 0 CLI 1.0 Git 2021-08-25 16:47:47 24442 10 https://github.com/operate-first/blueprint \N \N 2021-08-25 16:47:47 New \N \N \N \N \N Parent not available \N \N 0 CLI 1.0 Git 2021-08-25 16:47:47 25445 10 https://github.com/chaoss/grimoirelab-perceval-opnfv \N \N 2020-04-17 21:40:39 New \N \N \N \N \N Parent not available \N \N 0 CLI 1.0 Git 2021-04-17 21:40:39 -1 1 https://github.com/chaoss/augur \N \N 2021-08-10 14:28:44 New \N \N \N \N \N Parent not available \N \N 0 data load one git 2021-06-05 18:41:14 +1 1 https://github.com/chaoss/collectoss \N \N 2021-08-10 14:28:44 New \N \N \N \N \N Parent not available \N \N 0 data load one git 2021-06-05 18:41:14 25430 10 https://github.com/SociallyCompute/update-test \N \N 2021-10-07 08:50:13 New \N \N \N \N \N Parent not available \N \N 0 \N \N \N \N 25450 10 https://github.com/chaoss/grimoirelab-hatstall \N \N 2021-04-17 21:40:42 New \N \N \N \N \N Parent not available \N \N 0 CLI 1.0 Git 2021-04-17 21:40:42 \. 
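Migration 40 (`40_selectively_drop_test_table.py`) above guards its `drop_table` behind a reflection check so the upgrade is a no-op on databases that never had the stray `test` table. The same guard can be written with `sqlalchemy.inspect()`, which supersedes the older `Inspector.from_engine()` the migration uses; a minimal sketch under that assumption:

```python
import sqlalchemy as sa
from alembic import op


def upgrade():
    conn = op.get_bind()
    inspector = sa.inspect(conn)  # modern spelling of Inspector.from_engine(conn)
    # Drop the table only if it actually exists, keeping the migration safe
    # to run against databases that were created without it.
    if "test" in inspector.get_table_names():
        op.drop_table("test")
```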
diff --git a/augur/application/schema/create_schema.sql b/collectoss/application/schema/create_schema.sql similarity index 100% rename from augur/application/schema/create_schema.sql rename to collectoss/application/schema/create_schema.sql diff --git a/augur/application/schema/queries/ecosystem-comparison-queries.md b/collectoss/application/schema/queries/ecosystem-comparison-queries.md similarity index 100% rename from augur/application/schema/queries/ecosystem-comparison-queries.md rename to collectoss/application/schema/queries/ecosystem-comparison-queries.md diff --git a/augur/application/schema/queries/machine-learning-queries.md b/collectoss/application/schema/queries/machine-learning-queries.md similarity index 100% rename from augur/application/schema/queries/machine-learning-queries.md rename to collectoss/application/schema/queries/machine-learning-queries.md diff --git a/augur/application/schema/repo_group_load_sample.csv b/collectoss/application/schema/repo_group_load_sample.csv similarity index 100% rename from augur/application/schema/repo_group_load_sample.csv rename to collectoss/application/schema/repo_group_load_sample.csv diff --git a/augur/application/schema/repo_load_sample.csv b/collectoss/application/schema/repo_load_sample.csv similarity index 88% rename from augur/application/schema/repo_load_sample.csv rename to collectoss/application/schema/repo_load_sample.csv index ee11bb5ad..4088f77d4 100644 --- a/augur/application/schema/repo_load_sample.csv +++ b/collectoss/application/schema/repo_load_sample.csv @@ -1,5 +1,5 @@ repo_url,repo_group_id -https://github.com/chaoss/augur.git,10 +https://github.com/chaoss/collectoss.git,10 https://github.com/chaoss/grimoirelab.git,10 https://github.com/chaoss/wg-evolution.git,20 https://github.com/chaoss/wg-risk.git,20 diff --git a/collectoss/application/service_manager.py b/collectoss/application/service_manager.py new file mode 100644 index 000000000..3cebb4d34 --- /dev/null +++ b/collectoss/application/service_manager.py @@ -0,0 +1,166 @@ +import sys +import os +import subprocess +import sqlalchemy as s +from collectoss.application.logs import SystemLogger +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db.lib import get_value +from collectoss.tasks.init.redis_connection import get_redis_connection +from urllib.parse import urlparse + +logger = SystemLogger("collectoss_servicemanager").get_logger() + + +class SystemServiceManager: + """ Provides a storage space for references to the various components of collectoss. + This enables them to all be properly shut down in the event a shutdown signal (SIGINT, a.k.a. Ctrl-C, or SIGTERM) is received. + """ + def __init__(self, ctx, pidfile, disable_collection): + self.ctx = ctx + self.pidfile = pidfile + self.disable_collection = disable_collection + self.server = None + self.processes = [] + self.celery_beat_process = None + self.keypub = None + self.shutting_down = False + + def shutdown_signal_handler(self, signum, frame): + if self.shutting_down: + return + + self.shutting_down = True + logger.info(f"Received signal {signum}, shutting down gracefully") + + # Stop server + if self.server: + logger.info("Stopping server") + self.server.terminate() + try: + self.server.wait(timeout=5) + except subprocess.TimeoutExpired: + logger.warning("Server did not terminate in time, killing") + self.server.kill() + + # Stop celery workers + logger.info("Stopping celery workers") + for p in self.processes: + if p and p.poll() is None: + p.terminate() + + # 
Wait for workers to terminate + for p in self.processes: + if p: + try: + p.wait(timeout=3) + except subprocess.TimeoutExpired: + logger.warning(f"Worker {p.pid} did not terminate in time, killing") + p.kill() + + # Stop celery beat + if self.celery_beat_process: + logger.info("Stopping celery beat") + self.celery_beat_process.terminate() + try: + self.celery_beat_process.wait(timeout=3) + except subprocess.TimeoutExpired: + logger.warning("Celery beat did not terminate in time, killing") + self.celery_beat_process.kill() + + # Cleanup collection resources + if not self.disable_collection: + try: + if self.keypub: + self.keypub.shutdown() + cleanup_collection_status_and_rabbit(logger, self.ctx.obj.engine) + except Exception as e: + logger.debug(f"Error during collection cleanup: {e}") + + # Remove pidfile + if os.path.exists(self.pidfile): + try: + os.unlink(self.pidfile) + except OSError as e: + logger.error(f"Could not remove pidfile {self.pidfile}: {e}") + + sys.exit(0) + +def cleanup_collection_status_and_rabbit(logger, engine): + # TODO: tech debt: this should probably be in a helper function, but it's so tightly coupled with other stuff + clear_redis_caches() + + connection_string = get_value("RabbitMQ", "connection_string") + + with DatabaseSession(logger, engine=engine) as session: + + clean_collection_status(session) + + clear_rabbitmq_messages(connection_string) + +def clear_redis_caches(): + """Clears the redis databases that celery and collectoss use.""" + + logger.info("Flushing all redis databases this instance was using") + celery_purge_command = "celery -A collectoss.tasks.init.celery_app.celery_app purge -f" + subprocess.call(celery_purge_command.split(" ")) + + redis_connection = get_redis_connection() + redis_connection.flushdb() + +# Make sure the database reflects collection status when processes are killed/stopped. +def clean_collection_status(session): + session.execute_sql(s.sql.text(""" + UPDATE augur_operations.collection_status + SET core_status='Pending',core_task_id = NULL + WHERE core_status='Collecting' AND core_data_last_collected IS NULL; + + UPDATE augur_operations.collection_status + SET core_status='Success',core_task_id = NULL + WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; + + UPDATE augur_operations.collection_status + SET secondary_status='Pending',secondary_task_id = NULL + WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; + + UPDATE augur_operations.collection_status + SET secondary_status='Success',secondary_task_id = NULL + WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Update', facade_task_id=NULL + WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Success', facade_task_id=NULL + WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Pending', facade_task_id=NULL + WHERE facade_status='Failed Clone' OR facade_status='Initializing'; + """)) + # TODO: write timestamp for currently running repos.
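+# The resets above apply one rule per collection phase (core, secondary, facade):
+# a row stuck in a 'Collecting' state falls back to 'Pending' ('Update' for facade)
+# when nothing was ever collected, and to 'Success' when a prior last-collected
+# timestamp exists, so collection resumes cleanly after an unclean shutdown.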
+ + +def clear_rabbitmq_messages(connection_string): + #virtual_host_string = connection_string.split("/")[-1] + + logger.info("Clearing all messages from celery queue in rabbitmq") + from collectoss.tasks.init.celery_app import celery_app + celery_app.control.purge() + + clear_all_message_queues(connection_string) + #rabbitmq_purge_command = f"sudo rabbitmqctl purge_queue celery -p {virtual_host_string}" + #subprocess.call(rabbitmq_purge_command.split(" ")) + + +def clear_all_message_queues(connection_string): + queues = ['celery','secondary','scheduling','facade'] + + virtual_host_string = connection_string.split("/")[-1] + + #Parse username and password with urllib + parsed = urlparse(connection_string) + + for q in queues: + curl_cmd = f"curl -i -u {parsed.username}:{parsed.password} -XDELETE http://localhost:15672/api/queues/{virtual_host_string}/{q}" + subprocess.call(curl_cmd.split(" "),stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/augur/application/util.py b/collectoss/application/util.py similarity index 77% rename from augur/application/util.py rename to collectoss/application/util.py index af11d7d36..5c9744dce 100644 --- a/augur/application/util.py +++ b/collectoss/application/util.py @@ -1,8 +1,8 @@ import logging -from augur.application.db.session import DatabaseSession -from augur.application.db.engine import DatabaseEngine -from augur.util.repo_load_controller import RepoLoadController +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db.engine import DatabaseEngine +from collectoss.util.repo_load_controller import RepoLoadController logger = logging.getLogger(__name__) diff --git a/augur/static/css/dashboard.css b/collectoss/static/css/dashboard.css similarity index 100% rename from augur/static/css/dashboard.css rename to collectoss/static/css/dashboard.css diff --git a/augur/static/css/first_time.css b/collectoss/static/css/first_time.css similarity index 100% rename from augur/static/css/first_time.css rename to collectoss/static/css/first_time.css diff --git a/augur/static/css/stylesheet.css b/collectoss/static/css/stylesheet.css similarity index 99% rename from augur/static/css/stylesheet.css rename to collectoss/static/css/stylesheet.css index 59bbf0785..5511cb45d 100644 --- a/augur/static/css/stylesheet.css +++ b/collectoss/static/css/stylesheet.css @@ -302,10 +302,8 @@ img[id^="report_image_"]:hover { } } -.img_placeholder { - width: 256px; - height: 256px; -} + + .card-footer-wrap { width: 256px; diff --git a/augur/static/favicon/android-chrome-192x192.png b/collectoss/static/favicon/android-chrome-192x192.png similarity index 100% rename from augur/static/favicon/android-chrome-192x192.png rename to collectoss/static/favicon/android-chrome-192x192.png diff --git a/augur/static/favicon/android-chrome-512x512.png b/collectoss/static/favicon/android-chrome-512x512.png similarity index 100% rename from augur/static/favicon/android-chrome-512x512.png rename to collectoss/static/favicon/android-chrome-512x512.png diff --git a/augur/static/favicon/apple-touch-icon.png b/collectoss/static/favicon/apple-touch-icon.png similarity index 100% rename from augur/static/favicon/apple-touch-icon.png rename to collectoss/static/favicon/apple-touch-icon.png diff --git a/augur/static/favicon/favicon-16x16.png b/collectoss/static/favicon/favicon-16x16.png similarity index 100% rename from augur/static/favicon/favicon-16x16.png rename to collectoss/static/favicon/favicon-16x16.png diff --git 
a/augur/static/favicon/favicon-32x32.png b/collectoss/static/favicon/favicon-32x32.png similarity index 100% rename from augur/static/favicon/favicon-32x32.png rename to collectoss/static/favicon/favicon-32x32.png diff --git a/augur/static/favicon/favicon.ico b/collectoss/static/favicon/favicon.ico similarity index 100% rename from augur/static/favicon/favicon.ico rename to collectoss/static/favicon/favicon.ico diff --git a/augur/static/favicon/favicon.png b/collectoss/static/favicon/favicon.png similarity index 100% rename from augur/static/favicon/favicon.png rename to collectoss/static/favicon/favicon.png diff --git a/augur/static/favicon/favicon_source.svg b/collectoss/static/favicon/favicon_source.svg similarity index 100% rename from augur/static/favicon/favicon_source.svg rename to collectoss/static/favicon/favicon_source.svg diff --git a/augur/static/favicon/site.webmanifest b/collectoss/static/favicon/site.webmanifest similarity index 100% rename from augur/static/favicon/site.webmanifest rename to collectoss/static/favicon/site.webmanifest diff --git a/augur/static/img/Chaoss_Logo.png b/collectoss/static/img/Chaoss_Logo.png similarity index 100% rename from augur/static/img/Chaoss_Logo.png rename to collectoss/static/img/Chaoss_Logo.png diff --git a/augur/static/img/Chaoss_Logo_white.png b/collectoss/static/img/Chaoss_Logo_white.png similarity index 100% rename from augur/static/img/Chaoss_Logo_white.png rename to collectoss/static/img/Chaoss_Logo_white.png diff --git a/collectoss/static/img/collectoss-logo-black.svg b/collectoss/static/img/collectoss-logo-black.svg new file mode 100644 index 000000000..bfa1123b1 --- /dev/null +++ b/collectoss/static/img/collectoss-logo-black.svg @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/collectoss/static/img/collectoss-logo-white.svg b/collectoss/static/img/collectoss-logo-white.svg new file mode 100644 index 000000000..a886782f9 --- /dev/null +++ b/collectoss/static/img/collectoss-logo-white.svg @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/collectoss/static/img/collectoss_logo.png b/collectoss/static/img/collectoss_logo.png new file mode 100644 index 000000000..8c994c5de Binary files /dev/null and b/collectoss/static/img/collectoss_logo.png differ diff --git a/collectoss/static/img/collectoss_logo_black.png b/collectoss/static/img/collectoss_logo_black.png new file mode 100644 index 000000000..d72817e78 Binary files /dev/null and b/collectoss/static/img/collectoss_logo_black.png differ diff --git a/augur/static/img/notification-icon.svg b/collectoss/static/img/notification-icon.svg similarity index 100% rename from augur/static/img/notification-icon.svg rename to collectoss/static/img/notification-icon.svg diff --git a/augur/static/img/tswiftjet.png b/collectoss/static/img/tswiftjet.png similarity index 100% rename from augur/static/img/tswiftjet.png rename to collectoss/static/img/tswiftjet.png diff --git a/augur/static/js/range.js b/collectoss/static/js/range.js similarity index 100% rename from augur/static/js/range.js rename to collectoss/static/js/range.js diff --git a/augur/static/js/sleep.js b/collectoss/static/js/sleep.js similarity index 100% rename from augur/static/js/sleep.js rename to collectoss/static/js/sleep.js diff --git a/augur/static/js/textarea_resize.js b/collectoss/static/js/textarea_resize.js similarity index 100% rename from augur/static/js/textarea_resize.js rename to collectoss/static/js/textarea_resize.js 
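Circling back to `clear_all_message_queues` a few hunks up: it deletes the `celery`, `secondary`, `scheduling`, and `facade` queues by shelling out to curl against the RabbitMQ management API. A sketch of the same DELETE calls made in-process with `requests` follows, under the same assumptions as that helper (management API on localhost:15672, virtual host taken from the last segment of the connection string). Note that the vhost must be percent-encoded, so the default vhost `/` becomes `%2F`:

```python
from urllib.parse import urlparse, quote

import requests

def delete_queues(connection_string, queues=("celery", "secondary", "scheduling", "facade")):
    """Delete each queue via the RabbitMQ management HTTP API."""
    parsed = urlparse(connection_string)
    vhost = quote(connection_string.split("/")[-1], safe="")  # "/" -> "%2F"
    for queue in queues:
        resp = requests.delete(
            f"http://localhost:15672/api/queues/{vhost}/{queue}",
            auth=(parsed.username, parsed.password),
            timeout=10,
        )
        # 204 means deleted, 404 means the queue was never declared;
        # anything else is a real error worth surfacing.
        if resp.status_code not in (204, 404):
            resp.raise_for_status()
```

An in-process call also keeps the credentials out of the process table, which the curl command line above does not.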
diff --git a/augur/tasks/__init__.py b/collectoss/tasks/__init__.py similarity index 100% rename from augur/tasks/__init__.py rename to collectoss/tasks/__init__.py diff --git a/augur/tasks/data_analysis/__init__.py b/collectoss/tasks/data_analysis/__init__.py similarity index 53% rename from augur/tasks/data_analysis/__init__.py rename to collectoss/tasks/data_analysis/__init__.py index 0db9a97ec..ff816e600 100644 --- a/augur/tasks/data_analysis/__init__.py +++ b/collectoss/tasks/data_analysis/__init__.py @@ -2,11 +2,11 @@ import logging def machine_learning_phase(repo_git, full_collection): - from augur.tasks.data_analysis.clustering_worker.tasks import clustering_task - from augur.tasks.data_analysis.discourse_analysis.tasks import discourse_analysis_task - from augur.tasks.data_analysis.insight_worker.tasks import insight_task - from augur.tasks.data_analysis.message_insights.tasks import message_insight_task - from augur.tasks.data_analysis.pull_request_analysis_worker.tasks import pull_request_analysis_task + from collectoss.tasks.data_analysis.clustering_worker.tasks import clustering_task + from collectoss.tasks.data_analysis.discourse_analysis.tasks import discourse_analysis_task + from collectoss.tasks.data_analysis.insight_worker.tasks import insight_task + from collectoss.tasks.data_analysis.message_insights.tasks import message_insight_task + from collectoss.tasks.data_analysis.pull_request_analysis_worker.tasks import pull_request_analysis_task logger = logging.getLogger(machine_learning_phase.__name__) diff --git a/augur/tasks/data_analysis/clustering_worker/__init__.py b/collectoss/tasks/data_analysis/clustering_worker/__init__.py similarity index 100% rename from augur/tasks/data_analysis/clustering_worker/__init__.py rename to collectoss/tasks/data_analysis/clustering_worker/__init__.py diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/collectoss/tasks/data_analysis/clustering_worker/tasks.py similarity index 97% rename from augur/tasks/data_analysis/clustering_worker/tasks.py rename to collectoss/tasks/data_analysis/clustering_worker/tasks.py index d548ecf10..c9e269e5f 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/collectoss/tasks/data_analysis/clustering_worker/tasks.py @@ -19,17 +19,17 @@ from textblob import TextBlob from collections import Counter -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git -from augur.application.db.models import RepoClusterMessage, RepoTopic, TopicWord -from augur.tasks.init.celery_app import AugurMlRepoCollectionTask +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import get_value, get_session, get_repo_by_repo_git +from collectoss.application.db.models import RepoClusterMessage, RepoTopic, TopicWord +from collectoss.tasks.init.celery_app import MLRepoCollectionTask MODEL_FILE_NAME = "kmeans_repo_messages" stemmer = nltk.stem.snowball.SnowballStemmer("english") -@celery.task(base=AugurMlRepoCollectionTask, bind=True) +@celery.task(base=MLRepoCollectionTask, bind=True) def clustering_task(self, repo_git): logger = logging.getLogger(clustering_model.__name__) @@ -51,7 +51,7 @@ def clustering_model(repo_git: str,logger,engine) -> None: tool_source = 'Clustering Worker' tool_version = '0.2.0' - data_source = 'Augur Collected Messages' + data_source = 'Collected Messages' repo_id = get_repo_by_repo_git(repo_git).repo_id @@ -62,7 +62,7 @@ def 
clustering_model(repo_git: str,logger,engine) -> None: logger.info(f"Min df: {min_df}. Max df: {max_df}") - logger.info("If you did not install NLTK libraries when you installed Augur, this will fail. ") + logger.info("If you did not install NLTK libraries when you installed CollectOSS, this will fail. ") #nltk.download('all') logger.info(f"Getting repo messages for repo_id: {repo_id}") diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/__init__.py b/collectoss/tasks/data_analysis/contributor_breadth_worker/__init__.py similarity index 100% rename from augur/tasks/data_analysis/contributor_breadth_worker/__init__.py rename to collectoss/tasks/data_analysis/contributor_breadth_worker/__init__.py diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py similarity index 77% rename from augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py rename to collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 896ccd61d..108326b50 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -3,13 +3,13 @@ import sqlalchemy as s from datetime import datetime -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException -from augur.application.db.models import ContributorRepo -from augur.application.db.lib import bulk_insert_dicts -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from collectoss.application.db.models import ContributorRepo +from collectoss.application.db.lib import bulk_insert_dicts +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -### This worker scans all the platform users in Augur, and pulls their platform activity +### This worker scans all the platform users in CollectOSS, and pulls their platform activity ### logs. Those are then used to analyze what repos each is working in (which will include repos not ### tracked in the Augur instance.) 
### Logic: For each unique platform contributor, gather non duplicate events, using the GitHub "id" @@ -20,7 +20,7 @@ def contributor_breadth_model(self) -> None: engine = self.app.engine - logger = logging.getLogger(contributor_breadth_model.__name__) + logger = logging.getLogger(__name__) tool_source = 'Contributor Breadth Worker' tool_version = '0.0.1' @@ -89,7 +89,7 @@ def contributor_breadth_model(self) -> None: total = len(current_cntrb_logins) for cntrb in current_cntrb_logins: - print(f"Processing cntrb {index} of {total}") + logger.info(f"Processing cntrb {index} of {total}") index += 1 repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" @@ -129,18 +129,32 @@ def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_ve cntrb_repos_insert = [] for event_id_api in cntrb_events: - + + repo = event_id_api.get("repo") + + if not repo or not repo.get("url"): + logger.warning( + "Skipping GitHub event due to empty repo or missing repo.url", + extra={ + "event_id": event_id_api.get("id"), + "event_type": event_id_api.get("type"), + "public": event_id_api.get("public"), + "repo": repo, + }, + ) + continue + cntrb_repos_insert.append({ "cntrb_id": cntrb['cntrb_id'], - "repo_git": event_id_api['repo']['url'], + "repo_git": repo['url'], "tool_source": tool_source, "tool_version": tool_version, "data_source": data_source, - "repo_name": event_id_api['repo']['name'], - "gh_repo_id": event_id_api['repo']['id'], - "cntrb_category": event_id_api['type'], + "repo_name": repo['name'], + "gh_repo_id": repo['id'], + "cntrb_category": event_id_api.get('type'), "event_id": int(event_id_api['id']), - "created_at": event_id_api['created_at'] + "created_at": event_id_api.get('created_at') }) return cntrb_repos_insert diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/tasks.py b/collectoss/tasks/data_analysis/contributor_breadth_worker/tasks.py similarity index 100% rename from augur/tasks/data_analysis/contributor_breadth_worker/tasks.py rename to collectoss/tasks/data_analysis/contributor_breadth_worker/tasks.py diff --git a/augur/tasks/data_analysis/discourse_analysis/__init__.py b/collectoss/tasks/data_analysis/discourse_analysis/__init__.py similarity index 100% rename from augur/tasks/data_analysis/discourse_analysis/__init__.py rename to collectoss/tasks/data_analysis/discourse_analysis/__init__.py diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py similarity index 93% rename from augur/tasks/data_analysis/discourse_analysis/tasks.py rename to collectoss/tasks/data_analysis/discourse_analysis/tasks.py index e78e030e6..a95756b8c 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py @@ -7,11 +7,11 @@ import os from collections import Counter -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_session, get_repo_by_repo_git -from augur.application.db.models import Repo, DiscourseInsight -from augur.application.db.util import execute_session_query -from augur.tasks.init.celery_app import AugurMlRepoCollectionTask +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import get_session, get_repo_by_repo_git +from collectoss.application.db.models import Repo, DiscourseInsight +from collectoss.application.db.util import execute_session_query +from collectoss.tasks.init.celery_app import 
MLRepoCollectionTask #import os, sys, time, requests, json # from sklearn.model_selection import train_test_split @@ -29,10 +29,10 @@ # from os import path stemmer = nltk.stem.snowball.SnowballStemmer("english") -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) -DISCOURSE_ANALYSIS_DIR = f"{ROOT_AUGUR_DIRECTORY}/tasks/data_analysis/discourse_analysis/" +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +DISCOURSE_ANALYSIS_DIR = f"{ROOT_PROJECT_REPO_DIRECTORY}/tasks/data_analysis/discourse_analysis/" -@celery.task(base=AugurMlRepoCollectionTask, bind=True) +@celery.task(base=MLRepoCollectionTask, bind=True) def discourse_analysis_task(self, repo_git): logger = logging.getLogger(discourse_analysis_task.__name__) diff --git a/augur/tasks/data_analysis/discourse_analysis/tfidf_transformer b/collectoss/tasks/data_analysis/discourse_analysis/tfidf_transformer similarity index 100% rename from augur/tasks/data_analysis/discourse_analysis/tfidf_transformer rename to collectoss/tasks/data_analysis/discourse_analysis/tfidf_transformer diff --git a/augur/tasks/data_analysis/discourse_analysis/trained_crf_model b/collectoss/tasks/data_analysis/discourse_analysis/trained_crf_model similarity index 100% rename from augur/tasks/data_analysis/discourse_analysis/trained_crf_model rename to collectoss/tasks/data_analysis/discourse_analysis/trained_crf_model diff --git a/augur/tasks/data_analysis/discourse_analysis/word_to_emotion_map b/collectoss/tasks/data_analysis/discourse_analysis/word_to_emotion_map similarity index 100% rename from augur/tasks/data_analysis/discourse_analysis/word_to_emotion_map rename to collectoss/tasks/data_analysis/discourse_analysis/word_to_emotion_map diff --git a/collectoss/tasks/data_analysis/insight_worker/__init__.py b/collectoss/tasks/data_analysis/insight_worker/__init__.py new file mode 100644 index 000000000..60bc554ea --- /dev/null +++ b/collectoss/tasks/data_analysis/insight_worker/__init__.py @@ -0,0 +1,5 @@ +#SPDX-License-Identifier: MIT +"""insight_worker - CollectOSS Worker that detects anomalies in repo metrics""" + +__version__ = '0.0.2' +__all__ = [] diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/collectoss/tasks/data_analysis/insight_worker/tasks.py similarity index 98% rename from augur/tasks/data_analysis/insight_worker/tasks.py rename to collectoss/tasks/data_analysis/insight_worker/tasks.py index 97a6580d6..123407c58 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/collectoss/tasks/data_analysis/insight_worker/tasks.py @@ -9,15 +9,15 @@ from sklearn.ensemble import IsolationForest import warnings -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session -from augur.application.db.models import ChaossMetricStatus, RepoInsight, RepoInsightsRecord -from augur.tasks.init.celery_app import AugurMlRepoCollectionTask +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import get_value, get_repo_by_repo_git, get_session +from collectoss.application.db.models import ChaossMetricStatus, RepoInsight, RepoInsightsRecord +from collectoss.tasks.init.celery_app import MLRepoCollectionTask warnings.filterwarnings('ignore') -@celery.task(base=AugurMlRepoCollectionTask, bind=True) +@celery.task(base=MLRepoCollectionTask, bind=True) def insight_task(self, repo_git):
logger = logging.getLogger(insight_task.__name__) @@ -32,7 +32,7 @@ def insight_model(repo_git: str,logger,engine) -> None: tool_source = 'Insight Worker' tool_version = '1.0.0' - data_source = 'Augur API' + data_source = 'Built-in API' metrics = {"issues-new": "issues", "code-changes": "commit_count", "code-changes-lines": "added", "reviews": "pull_requests", "contributors-new": "new_contributors"} @@ -106,7 +106,7 @@ def insight_model(repo_git: str,logger,engine) -> None: AND ri_date < :min_date """) - with engine.connect() as conn: + with engine.begin() as conn: result = conn.execute(delete_record_SQL, parameters=dict(repo_id=repo_id, min_date=min_date)) logger.info("Deleting out of date data points ...\n") @@ -320,7 +320,7 @@ def confidence_interval_insights(logger, engine): # """ Query all endpoints """ # endpointSQL = s.sql.text(""" - # SELECT * FROM chaoss_metric_status WHERE cm_source = 'augur_db' + # SELECT * FROM chaoss_metric_status WHERE cm_source = 'collectoss_db' # """) #with DatabaseEngine(connection_pool_size=1) as engine: # for endpoint in pd.read_sql(endpointSQL,engine, params={}).to_records(): diff --git a/augur/tasks/data_analysis/message_insights/README.md b/collectoss/tasks/data_analysis/message_insights/README.md similarity index 95% rename from augur/tasks/data_analysis/message_insights/README.md rename to collectoss/tasks/data_analysis/message_insights/README.md index 185d1396b..d11c3193b 100644 --- a/augur/tasks/data_analysis/message_insights/README.md +++ b/collectoss/tasks/data_analysis/message_insights/README.md @@ -14,9 +14,9 @@ To kickstart the worker, it needs to receive a task from the Housekeeper, simila The standard options are: -- ``switch`` - a boolean flag indicating if the worker should automatically be started with Augur. Defaults to ``0`` (false). -- ``workers`` - the number of instances of this worker that Augur should spawn if ``switch`` is set to ``1``. Defaults to ``1``. -- ``port`` - the TCP port the worker will use to communicate with Augur’s broker, the default being ``51300``. +- ``switch`` - a boolean flag indicating if the worker should automatically be started with CollectOSS. Defaults to ``0`` (false). +- ``workers`` - the number of instances of this worker that CollectOSS should spawn if ``switch`` is set to ``1``. Defaults to ``1``. +- ``port`` - the TCP port the worker will use to communicate with CollectOSS’s broker, the default being ``51300``. - ``insight_days`` - the most recent period (in days) for which insights would be calculated. - ``models_dir`` - the directory within the worker directory, where all trained machine learning models would be stored. 
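Since the README hunk above documents defaults for only three of the five options, here is a small self-contained sketch of how they might be resolved; the plain dict stands in for whatever config source a deployment actually uses, the key names mirror the list above, and the example values are illustrative:

```python
# Documented defaults from the README above; insight_days and models_dir
# have no documented defaults and must be supplied by the operator.
DEFAULTS = {"switch": 0, "workers": 1, "port": 51300}

def resolve_worker_options(configured: dict) -> dict:
    """Overlay operator-supplied values on the documented defaults."""
    options = {**DEFAULTS, **configured}
    missing = {"insight_days", "models_dir"} - options.keys()
    if missing:
        raise KeyError(f"options without defaults must be set: {sorted(missing)}")
    return options

# Example: enable the worker and compute insights over the last two weeks.
opts = resolve_worker_options({"switch": 1, "insight_days": 14, "models_dir": "message_models"})
assert opts["workers"] == 1 and opts["port"] == 51300
```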
diff --git a/collectoss/tasks/data_analysis/message_insights/__init__.py b/collectoss/tasks/data_analysis/message_insights/__init__.py new file mode 100644 index 000000000..e9cb1e18f --- /dev/null +++ b/collectoss/tasks/data_analysis/message_insights/__init__.py @@ -0,0 +1,6 @@ +#SPDX-License-Identifier: MIT + +"""message_insights - CollectOSS Worker that analyzes PR and issue messages""" + +__version__ = '0.3.1' +__all__ = [] diff --git a/augur/tasks/data_analysis/message_insights/message_models/directory.md b/collectoss/tasks/data_analysis/message_insights/message_models/directory.md similarity index 100% rename from augur/tasks/data_analysis/message_insights/message_models/directory.md rename to collectoss/tasks/data_analysis/message_insights/message_models/directory.md diff --git a/augur/tasks/data_analysis/message_insights/message_models/message_models.md b/collectoss/tasks/data_analysis/message_insights/message_models/message_models.md similarity index 100% rename from augur/tasks/data_analysis/message_insights/message_models/message_models.md rename to collectoss/tasks/data_analysis/message_insights/message_models/message_models.md diff --git a/augur/tasks/data_analysis/message_insights/message_novelty.py b/collectoss/tasks/data_analysis/message_insights/message_novelty.py similarity index 95% rename from augur/tasks/data_analysis/message_insights/message_novelty.py rename to collectoss/tasks/data_analysis/message_insights/message_novelty.py index 7821cbf27..bb5531b04 100644 --- a/augur/tasks/data_analysis/message_insights/message_novelty.py +++ b/collectoss/tasks/data_analysis/message_insights/message_novelty.py @@ -13,13 +13,13 @@ from skimage.filters import threshold_otsu from sklearn import utils as skl_utils -from augur.tasks.data_analysis.message_insights.preprocess_text import \ +from collectoss.tasks.data_analysis.message_insights.preprocess_text import \ normalize_corpus as normalize_corpus -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) -train_path = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", "train_data") +train_path = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", "train_data") # ''' Doc2Vec model training diff --git a/augur/tasks/data_analysis/message_insights/message_sentiment.py b/collectoss/tasks/data_analysis/message_insights/message_sentiment.py similarity index 97% rename from augur/tasks/data_analysis/message_insights/message_sentiment.py rename to collectoss/tasks/data_analysis/message_insights/message_sentiment.py index 73c077f5f..4ce60c7d6 100644 --- a/augur/tasks/data_analysis/message_insights/message_sentiment.py +++ b/collectoss/tasks/data_analysis/message_insights/message_sentiment.py @@ -25,16 +25,16 @@ # from openpyxl import load_workbook -from augur.tasks.data_analysis.message_insights.preprocess_text import \ +from collectoss.tasks.data_analysis.message_insights.preprocess_text import \ CONTRACTION_MAP as contraction_map warnings.filterwarnings('ignore') -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) CONTRACTION_MAP = contraction_map -train_path = os.path.join(ROOT_AUGUR_DIRECTORY, 
"tasks", "data_analysis", "message_insights", "train_data") +train_path = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", "train_data") def replace_all(text, dic): if(sys.version_info[0] < 3): diff --git a/augur/tasks/data_analysis/message_insights/preprocess_text.py b/collectoss/tasks/data_analysis/message_insights/preprocess_text.py similarity index 100% rename from augur/tasks/data_analysis/message_insights/preprocess_text.py rename to collectoss/tasks/data_analysis/message_insights/preprocess_text.py diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/collectoss/tasks/data_analysis/message_insights/tasks.py similarity index 96% rename from augur/tasks/data_analysis/message_insights/tasks.py rename to collectoss/tasks/data_analysis/message_insights/tasks.py index fe12bb960..7913a5d13 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/collectoss/tasks/data_analysis/message_insights/tasks.py @@ -8,19 +8,19 @@ from skimage.filters import threshold_otsu from sklearn.ensemble import IsolationForest -from augur.tasks.data_analysis.message_insights.message_novelty import novelty_analysis -from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score +from collectoss.tasks.data_analysis.message_insights.message_novelty import novelty_analysis +from collectoss.tasks.data_analysis.message_insights.message_sentiment import get_senti_score -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session -from augur.application.db.models import MessageAnalysis, MessageAnalysisSummary -from augur.tasks.init.celery_app import AugurMlRepoCollectionTask +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import get_value, get_repo_by_repo_git, get_session +from collectoss.application.db.models import MessageAnalysis, MessageAnalysisSummary +from collectoss.tasks.init.celery_app import MLRepoCollectionTask #SPDX-License-Identifier: MIT -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) -@celery.task(base=AugurMlRepoCollectionTask, bind=True) +@celery.task(base=MLRepoCollectionTask, bind=True) def message_insight_task(self, repo_git): logger = logging.getLogger(message_insight_task.__name__) @@ -45,7 +45,7 @@ def message_insight_model(repo_git: str,logger,engine) -> None: repo = get_repo_by_repo_git(repo_git) repo_id = repo.repo_id - models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) + models_dir = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) insight_days = get_value("Message_Insights", 'insight_days') # Any initial database instructions, like finding the last tuple inserted or generate the next ID value diff --git a/augur/tasks/data_analysis/message_insights/train_data/.gitkeep b/collectoss/tasks/data_analysis/message_insights/train_data/.gitkeep similarity index 100% rename from augur/tasks/data_analysis/message_insights/train_data/.gitkeep rename to collectoss/tasks/data_analysis/message_insights/train_data/.gitkeep diff --git a/augur/tasks/data_analysis/message_insights/train_data/EmoticonLookupTable.txt 
b/collectoss/tasks/data_analysis/message_insights/train_data/EmoticonLookupTable.txt similarity index 100% rename from augur/tasks/data_analysis/message_insights/train_data/EmoticonLookupTable.txt rename to collectoss/tasks/data_analysis/message_insights/train_data/EmoticonLookupTable.txt diff --git a/augur/tasks/data_analysis/message_insights/train_data/README.md b/collectoss/tasks/data_analysis/message_insights/train_data/README.md similarity index 100% rename from augur/tasks/data_analysis/message_insights/train_data/README.md rename to collectoss/tasks/data_analysis/message_insights/train_data/README.md diff --git a/augur/tasks/data_analysis/message_insights/train_data/directory.md b/collectoss/tasks/data_analysis/message_insights/train_data/directory.md similarity index 100% rename from augur/tasks/data_analysis/message_insights/train_data/directory.md rename to collectoss/tasks/data_analysis/message_insights/train_data/directory.md diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/.gitkeep b/collectoss/tasks/data_analysis/pull_request_analysis_worker/.gitkeep similarity index 100% rename from augur/tasks/data_analysis/pull_request_analysis_worker/.gitkeep rename to collectoss/tasks/data_analysis/pull_request_analysis_worker/.gitkeep diff --git a/collectoss/tasks/data_analysis/pull_request_analysis_worker/__init__.py b/collectoss/tasks/data_analysis/pull_request_analysis_worker/__init__.py new file mode 100644 index 000000000..efb3705ba --- /dev/null +++ b/collectoss/tasks/data_analysis/pull_request_analysis_worker/__init__.py @@ -0,0 +1,4 @@ +"""pull_request_analysis_worker - CollectOSS Worker that predicts acceptance of a PR""" + +__version__ = '0.0.0' +__all__ = [] \ No newline at end of file diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py similarity index 93% rename from augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py rename to collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py index 2347eb109..aa8d5a0a0 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -6,21 +6,21 @@ import pandas as pd import sqlalchemy as s -from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score +from collectoss.tasks.data_analysis.message_insights.message_sentiment import get_senti_score -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git -from augur.application.db.models import PullRequestAnalysis -from augur.tasks.init.celery_app import AugurMlRepoCollectionTask +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import get_value, get_session, get_repo_by_repo_git +from collectoss.application.db.models import PullRequestAnalysis +from collectoss.tasks.init.celery_app import MLRepoCollectionTask # from sklearn.metrics import (confusion_matrix, f1_score, precision_score, recall_score) # from sklearn.preprocessing import LabelEncoder, MinMaxScaler # from xgboost import XGBClassifier -ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) -@celery.task(base=AugurMlRepoCollectionTask, bind=True)
+@celery.task(base=MLRepoCollectionTask, bind=True) def pull_request_analysis_task(self, repo_git): logger = logging.getLogger(pull_request_analysis_task.__name__) @@ -40,7 +40,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: repo_id = get_repo_by_repo_git(repo_git).repo_id - senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) + senti_models_dir = os.path.join(ROOT_PROJECT_REPO_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) logger.info(f'Sentiment model dir located - {senti_models_dir}') diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/trained_pr_model.pkl b/collectoss/tasks/data_analysis/pull_request_analysis_worker/trained_pr_model.pkl similarity index 100% rename from augur/tasks/data_analysis/pull_request_analysis_worker/trained_pr_model.pkl rename to collectoss/tasks/data_analysis/pull_request_analysis_worker/trained_pr_model.pkl diff --git a/augur/tasks/db/__init__.py b/collectoss/tasks/db/__init__.py similarity index 100% rename from augur/tasks/db/__init__.py rename to collectoss/tasks/db/__init__.py diff --git a/augur/tasks/db/refresh_materialized_views.py b/collectoss/tasks/db/refresh_materialized_views.py similarity index 92% rename from augur/tasks/db/refresh_materialized_views.py rename to collectoss/tasks/db/refresh_materialized_views.py index 37e3ef561..95f169722 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/collectoss/tasks/db/refresh_materialized_views.py @@ -2,16 +2,16 @@ import logging import sqlalchemy as s -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import execute_sql -from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper -from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import invalidate_caches, rebuild_unknown_affiliation_and_web_caches +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import execute_sql +from collectoss.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper +from collectoss.tasks.git.util.facade_worker.facade_worker.rebuildcache import invalidate_caches, rebuild_unknown_affiliation_and_web_caches @celery.task(bind=True) def refresh_materialized_views(self): - #self.logger = AugurLogger("data_collection_jobs").get_logger() + #self.logger = SystemLogger("data_collection_jobs").get_logger() engine = self.app.engine @@ -187,14 +187,14 @@ def refresh_materialized_views(self): logger.error("Nuke stored affiliations is deprecated!") # deprecated because the UI component of facade where affiliations would be # nuked upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. + # from queries and materialized views in the current version of CollectOSS. # This method is also a major performance bottleneck with little value. if not facade_helper.limited_run or (facade_helper.limited_run and facade_helper.fix_affiliations): logger.error("Fill empty affiliations is deprecated!") # deprecated because the UI component of facade where affiliations would need # to be fixed upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. + # from queries and materialized views in the current version of CollectOSS. 
# This method is also a major performance bottleneck with little value. if facade_helper.force_invalidate_caches: diff --git a/augur/tasks/frontend.py b/collectoss/tasks/frontend.py similarity index 89% rename from augur/tasks/frontend.py rename to collectoss/tasks/frontend.py index d1a391814..4ed2e24aa 100644 --- a/augur/tasks/frontend.py +++ b/collectoss/tasks/frontend.py @@ -5,15 +5,15 @@ from time import sleep -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.github.util.github_task_session import GithubTaskSession -from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess -from augur.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_github_repo_by_src_id, get_gitlab_repo_by_src_id -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus -from augur.tasks.github.util.github_paginator import hit_api +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.github.util.github_task_session import GithubTaskSession +from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from collectoss.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_github_repo_by_src_id, get_gitlab_repo_by_src_id +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus +from collectoss.tasks.github.util.github_paginator import hit_api -from augur.application.db.models import UserRepo, Repo +from collectoss.application.db.models import UserRepo, Repo def parse_org_name(string): @@ -111,7 +111,6 @@ def add_gitlab_repos(user_id, group_name, repo_urls): if existing_repo.repo_group_id != repo_group_id: update_existing_repos_repo_group_id(session, existing_repo.repo_id, repo_group_id) - # TODO: add logic to update the existing records repo_group_id if it isn't equal to the existing record add_existing_repo_to_group(logger, session, group_id, existing_repo.repo_id) continue @@ -150,7 +149,6 @@ def get_org_repo_data(orgs, session): return repo_data -# TODO: Do we need to check if the repo already exists in the user group? 
def add_new_github_repos(repo_data, group_id, session, logger): # get data for repos to determine type, src id, and if they exist @@ -198,7 +196,6 @@ def divide_list_into_chunks(data, size): yield data[i:i + size] -# TODO: Make it only get like 100 at a time def get_github_repos_data(repo_data, session, logger): repo_urls = [x[0] for x in repo_data] @@ -364,42 +361,3 @@ def update_existing_repos_repo_group_id(session, repo_id, new_repo_group_id): # invalid_urls.append(url) # return valid_orgs, valid_repos, invalid_urls - - - - - -# TODO: Change to github specific -# @celery.task -# def add_repo(user_id, group_name, repo_url): - -# logger = logging.getLogger(add_org.__name__) - -# with GithubTaskSession(logger) as session: -# result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) - -# print(repo_url, result) - - -# # TODO: Change to github specific -# @celery.task -# def add_org(user_id, group_name, org_url): - -# logger = logging.getLogger(add_org.__name__) - -# with GithubTaskSession(logger) as session: -# result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) - -# print(org_url, result) - - - - - - - - - - - - diff --git a/augur/tasks/git/__init__.py b/collectoss/tasks/git/__init__.py similarity index 100% rename from augur/tasks/git/__init__.py rename to collectoss/tasks/git/__init__.py diff --git a/augur/tasks/git/dependency_libyear_tasks/__init__.py b/collectoss/tasks/git/dependency_libyear_tasks/__init__.py similarity index 100% rename from augur/tasks/git/dependency_libyear_tasks/__init__.py rename to collectoss/tasks/git/dependency_libyear_tasks/__init__.py diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/collectoss/tasks/git/dependency_libyear_tasks/core.py similarity index 90% rename from augur/tasks/git/dependency_libyear_tasks/core.py rename to collectoss/tasks/git/dependency_libyear_tasks/core.py index ba31fac09..56b8f1a5b 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/collectoss/tasks/git/dependency_libyear_tasks/core.py @@ -1,8 +1,8 @@ from datetime import datetime -from augur.application.db.models import * -from augur.application.db.lib import get_value, bulk_insert_dicts, get_repo_by_repo_git -from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from collectoss.application.db.models import * +from collectoss.application.db.lib import get_value, bulk_insert_dicts, get_repo_by_repo_git +from collectoss.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path def deps_libyear_model(logger,repo_git): """ Data collection and storage method diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/__init__.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/__init__.py similarity index 100% rename from augur/tasks/git/dependency_libyear_tasks/libyear_util/__init__.py rename to collectoss/tasks/git/dependency_libyear_tasks/libyear_util/__init__.py diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py similarity index 100% rename from augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py rename to collectoss/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py diff 
--git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_parser.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/npm_parser.py similarity index 100% rename from augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_parser.py rename to collectoss/tasks/git/dependency_libyear_tasks/libyear_util/npm_parser.py diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py similarity index 100% rename from augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py rename to collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py similarity index 100% rename from augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py rename to collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py similarity index 61% rename from augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py rename to collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py index cf40b9f73..372e64c82 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -1,9 +1,9 @@ import dateutil.parser import os -from augur.tasks.git.dependency_libyear_tasks.libyear_util.pypi_parser import parse_conda, parse_pipfile,parse_pipfile_lock,parse_poetry,parse_poetry_lock,parse_requirement_txt,parse_setup_py -from augur.tasks.git.dependency_libyear_tasks.libyear_util.npm_parser import parse_package_json -from augur.tasks.git.dependency_libyear_tasks.libyear_util.pypi_libyear_util import sort_dependency_requirement,get_pypi_data,get_latest_version,get_release_date -from augur.tasks.git.dependency_libyear_tasks.libyear_util.npm_libyear_utils import get_NPM_data, get_npm_release_date, get_npm_latest_version,get_npm_current_version +from collectoss.tasks.git.dependency_libyear_tasks.libyear_util.pypi_parser import parse_conda, parse_pipfile,parse_pipfile_lock,parse_poetry,parse_poetry_lock,parse_requirement_txt,parse_setup_py +from collectoss.tasks.git.dependency_libyear_tasks.libyear_util.npm_parser import parse_package_json +from collectoss.tasks.git.dependency_libyear_tasks.libyear_util.pypi_libyear_util import sort_dependency_requirement,get_pypi_data,get_latest_version,get_release_date +from collectoss.tasks.git.dependency_libyear_tasks.libyear_util.npm_libyear_utils import get_NPM_data, get_npm_release_date, get_npm_latest_version,get_npm_current_version #Files That would be parsed should be added here file_list = [ @@ -111,69 +111,73 @@ def get_deps_libyear_data(path, logger): #NOTE: Add new if for new package parser if dependency['package'] == 'PYPI': data = get_pypi_data(dependency['name']) + if not data: + logger.warning(f"Skipping dependency {dependency['name']} - could not find package on PYPI.") + continue + try: current_version = sort_dependency_requirement(dependency,data) except (KeyError, TypeError) as e: - logger.error(f"Could not get current version of dependency for path {path}.\n Dependency: {dependency}") - current_version = None + logger.warning(f"Skipping dependency {dependency['name']} - could not resolve current version 
requirement: {dependency['requirement']}") + continue + + if not current_version: + logger.warning(f"Skipping dependency {dependency['name']} - current version is null or unspecified.") + continue + try: latest_version = get_latest_version(data) - - except KeyError: - logger.error(f"Could not get current version of dependency for path {path}.\n Dependency: {dependency}") - latest_version = None + except (KeyError, TypeError): + logger.warning(f"Skipping dependency {dependency['name']} - could not get latest version from PYPI.") + continue + if not latest_version: + logger.warning(f"Skipping dependency {dependency['name']} - latest version is null.") + continue + try: - if latest_version: - latest_release_date = get_release_date(data, latest_version,logger) - else: - latest_release_date = None + latest_release_date = get_release_date(data, latest_version, logger) + current_release_date = get_release_date(data, current_version, logger) except KeyError: - logger.error(f"Could not get current date of dependency for path {path} with version {latest_version}.\n Dependency: {dependency}") - latest_release_date = None - - if current_version: - current_release_date = get_release_date(data, current_version,logger) + logger.warning(f"Skipping dependency {dependency['name']} - could not find release dates for version {current_version} or {latest_version}.") + continue + + if not current_release_date or not latest_release_date: + logger.warning(f"Skipping dependency {dependency['name']} - missing release date information.") + continue elif dependency['package'] == 'NPM': data = get_NPM_data(dependency['name']) + if not data: + logger.warning(f"Skipping dependency {dependency['name']} - could not find package on NPM.") + continue + current_version = get_npm_current_version(data, dependency['requirement']) + if not current_version: + logger.warning(f"Skipping dependency {dependency['name']} - could not resolve current version from requirement: {dependency['requirement']}") + continue + try: latest_version = get_npm_latest_version(data) except KeyError: - logger.error(f"Could not get latest version of dependency for path {path}.\n Dependency: {dependency}") + logger.warning(f"Skipping dependency {dependency['name']} - could not get latest version from NPM.") latest_version = None + if not latest_version: + continue + try: - if latest_version: - latest_release_date = get_npm_release_date(data, latest_version) - else: - latest_release_date = None + latest_release_date = get_npm_release_date(data, latest_version) + current_release_date = get_npm_release_date(data, current_version) except KeyError: - logger.error(f"Could not get latest version of dependency for path {path}.\n Dependency: {dependency}") - latest_release_date = None - - if current_version: - try: - current_release_date = get_npm_release_date(data, current_version) - except KeyError: - logger.error(f"Could not get latest version of dependency for path {path}.\n Dependency: {dependency}") - current_release_date = dateutil.parser.parse('1970-01-01 00:00:00') + logger.warning(f"Skipping dependency {dependency['name']} - missing release date info on NPM for {current_version}/{latest_version}") + continue + else: + # Unsupported package manager + continue - libyear = get_libyear(current_version, current_release_date, latest_version, latest_release_date) - if not latest_release_date: - latest_release_date = dateutil.parser.parse('1970-01-01 00:00:00') - libyear = -1 - - if not latest_version: - latest_version = 'unspecified' - - if not 
current_version: - current_version = latest_version - current_release_date = latest_release_date - if not dependency['requirement']: dependency['requirement'] = 'unspecified' @@ -183,4 +187,4 @@ def get_deps_libyear_data(path, logger): dependency['latest_release_date'] = latest_release_date dependency['libyear'] = libyear - return [d for d in dependencies if d] \ No newline at end of file + return [d for d in dependencies if 'libyear' in d] \ No newline at end of file diff --git a/collectoss/tasks/git/dependency_libyear_tasks/tasks.py b/collectoss/tasks/git/dependency_libyear_tasks/tasks.py new file mode 100644 index 000000000..005ffb8dd --- /dev/null +++ b/collectoss/tasks/git/dependency_libyear_tasks/tasks.py @@ -0,0 +1,12 @@ +import logging +from collectoss.tasks.git.dependency_libyear_tasks.core import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import FacadeRepoCollectionTask + +@celery.task(base=FacadeRepoCollectionTask, bind=True) +def process_libyear_dependency_metrics(self, repo_git): + #raise NotImplementedError + + logger = logging.getLogger(process_libyear_dependency_metrics.__name__) + + deps_libyear_model(logger, repo_git) \ No newline at end of file diff --git a/augur/tasks/git/dependency_tasks/__init__.py b/collectoss/tasks/git/dependency_tasks/__init__.py similarity index 100% rename from augur/tasks/git/dependency_tasks/__init__.py rename to collectoss/tasks/git/dependency_tasks/__init__.py diff --git a/augur/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py similarity index 87% rename from augur/tasks/git/dependency_tasks/core.py rename to collectoss/tasks/git/dependency_tasks/core.py index 0f9b08d7d..a9e74b4e1 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -1,13 +1,13 @@ from datetime import datetime import os -from augur.application.db.models import * -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session -from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler -from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc -from augur.tasks.util.worker_util import parse_json_from_subprocess_call -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.tasks.util.metadata_exception import MetadataException +from collectoss.application.db.models import * +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session +from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from collectoss.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc +from collectoss.tasks.util.worker_util import parse_json_from_subprocess_call +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.util.metadata_exception import MetadataException def generate_deps_data(logger, repo_git): @@ -70,7 +70,7 @@ def generate_scorecard(logger, repo_git): repo_id = repo.repo_id logger.info('Generating scorecard data for repo') - # we convert relative path in the format required by scorecard like github.com/chaoss/augur + # we convert relative path in the format 
required by scorecard like github.com/chaoss/collectoss # raw_path,_ = path.split('-') # scorecard_repo_path = raw_path[2:] path = repo_git[8:] diff --git a/augur/tasks/git/dependency_tasks/dependency_util/__init__.py b/collectoss/tasks/git/dependency_tasks/dependency_util/__init__.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/__init__.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/__init__.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/c_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/c_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/c_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/c_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/cpp_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/cpp_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/cpp_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/cpp_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/csharp_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/csharp_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/csharp_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/csharp_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/dependency_calculator.py b/collectoss/tasks/git/dependency_tasks/dependency_util/dependency_calculator.py similarity index 62% rename from augur/tasks/git/dependency_tasks/dependency_util/dependency_calculator.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/dependency_calculator.py index 960e00d0a..85aa681ab 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/dependency_calculator.py +++ b/collectoss/tasks/git/dependency_tasks/dependency_util/dependency_calculator.py @@ -1,16 +1,16 @@ -from augur.tasks.git.dependency_tasks.dependency_util import python_deps -from augur.tasks.git.dependency_tasks.dependency_util import ruby_deps -from augur.tasks.git.dependency_tasks.dependency_util import php_deps -from augur.tasks.git.dependency_tasks.dependency_util import javascript_deps -from augur.tasks.git.dependency_tasks.dependency_util import vb_deps -from augur.tasks.git.dependency_tasks.dependency_util import csharp_deps -from augur.tasks.git.dependency_tasks.dependency_util import java_deps -from augur.tasks.git.dependency_tasks.dependency_util import cpp_deps -from augur.tasks.git.dependency_tasks.dependency_util import c_deps -from augur.tasks.git.dependency_tasks.dependency_util import go_deps -from augur.tasks.git.dependency_tasks.dependency_util import kotlin_deps -from augur.tasks.git.dependency_tasks.dependency_util import rust_deps -from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator +from collectoss.tasks.git.dependency_tasks.dependency_util import python_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import ruby_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import php_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import javascript_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import vb_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import csharp_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import java_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import cpp_deps 
+from collectoss.tasks.git.dependency_tasks.dependency_util import c_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import go_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import kotlin_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import rust_deps +from collectoss.tasks.git.dependency_tasks.dependency_util import dependency_calculator #Returns generator iterable to tuples of modules and their names def get_dependency_analysis_module_tuples(): diff --git a/augur/tasks/git/dependency_tasks/dependency_util/go_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/go_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/go_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/go_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/java_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/java_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/java_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/java_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/javascript_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/javascript_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/javascript_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/javascript_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/php_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/php_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/php_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/php_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/python_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/python_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/python_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/python_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/ruby_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/ruby_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/ruby_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/ruby_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/rust_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/rust_deps.py diff --git a/augur/tasks/git/dependency_tasks/dependency_util/vb_deps.py b/collectoss/tasks/git/dependency_tasks/dependency_util/vb_deps.py similarity index 100% rename from augur/tasks/git/dependency_tasks/dependency_util/vb_deps.py rename to collectoss/tasks/git/dependency_tasks/dependency_util/vb_deps.py diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/collectoss/tasks/git/dependency_tasks/tasks.py similarity index 74% rename from 
augur/tasks/git/dependency_tasks/tasks.py rename to collectoss/tasks/git/dependency_tasks/tasks.py index 731c71d00..a82296302 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/collectoss/tasks/git/dependency_tasks/tasks.py @@ -1,12 +1,12 @@ import logging import traceback -from augur.tasks.git.dependency_tasks.core import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.tasks.util.metadata_exception import MetadataException +from collectoss.tasks.git.dependency_tasks.core import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import FacadeRepoCollectionTask, SecondaryRepoCollectionTask +from collectoss.tasks.util.metadata_exception import MetadataException -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def process_dependency_metrics(repo_git): logger = logging.getLogger(process_dependency_metrics.__name__) @@ -14,7 +14,7 @@ def process_dependency_metrics(repo_git): generate_deps_data(logger, repo_git) -@celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) +@celery.task(base=SecondaryRepoCollectionTask, bind=True) def process_ossf_dependency_metrics(self, repo_git): engine = self.app.engine @@ -41,6 +41,6 @@ def process_ossf_dependency_metrics(self, repo_git): ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/billiard/reduction.py", line 56, in dumps cls(buf, protocol).dump(obj) - billiard.pool.MaybeEncodingError: Error sending result: ''(1, , None)''. Reason: ''PicklingError("Can\'t pickle : it\'s not the same object as augur.tasks.util.metadata_exception.MetadataException")''. + billiard.pool.MaybeEncodingError: Error sending result: ''(1, , None)''. Reason: ''PicklingError("Can\'t pickle : it\'s not the same object as collectoss.tasks.util.metadata_exception.MetadataException")''. 
``` """ \ No newline at end of file diff --git a/augur/tasks/git/facade_tasks.py b/collectoss/tasks/git/facade_tasks.py similarity index 89% rename from augur/tasks/git/facade_tasks.py rename to collectoss/tasks/git/facade_tasks.py index 09f980fd4..b610e1611 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/collectoss/tasks/git/facade_tasks.py @@ -5,32 +5,32 @@ from celery import group, chain from subprocess import check_output -from augur.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts, get_missing_commit_message_hashes +from collectoss.application.db.lib import get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits, bulk_insert_dicts, get_missing_commit_message_hashes -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set -from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set +from collectoss.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count -from augur.tasks.github.facade_github.tasks import * -from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper -from augur.tasks.util.collection_state import CollectionState -from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter -from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize, git_repo_updates +from collectoss.tasks.github.facade_github.tasks import * +from collectoss.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper +from collectoss.tasks.util.collection_state import CollectionState +from collectoss.tasks.util.collection_util import get_collection_status_repo_git_from_filter +from collectoss.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize, git_repo_updates -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import FacadeRepoCollectionTask -from augur.application.db.models import Repo, CollectionStatus, CommitMessage +from collectoss.application.db.models import Repo, CollectionStatus, CommitMessage -from augur.tasks.git.dependency_tasks.tasks import process_dependency_metrics -from augur.tasks.git.dependency_libyear_tasks.tasks import process_libyear_dependency_metrics -from augur.tasks.git.scc_value_tasks.tasks import process_scc_value_metrics +from collectoss.tasks.git.dependency_tasks.tasks 
import process_dependency_metrics +from collectoss.tasks.git.dependency_libyear_tasks.tasks import process_libyear_dependency_metrics +from collectoss.tasks.git.scc_value_tasks.tasks import process_scc_value_metrics -from augur.tasks.github.util.github_task_session import * +from collectoss.tasks.github.util.github_task_session import * def filter_null_repo_id(records, logger, context=""): """Remove and log records with null/None repo_id.""" @@ -65,7 +65,7 @@ def facade_error_handler(request,exc,traceback): #Predefine facade collection with tasks -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def facade_analysis_init_facade_task(repo_git): logger = logging.getLogger(facade_analysis_init_facade_task.__name__) @@ -75,7 +75,7 @@ def facade_analysis_init_facade_task(repo_git): facade_helper.log_activity('Info',f"Beginning analysis.") -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def trim_commits_facade_task(repo_git): logger = logging.getLogger(trim_commits_facade_task.__name__) @@ -102,7 +102,7 @@ def trim_commits_facade_task(repo_git): facade_helper.update_analysis_log(repo_id,'Collecting data') logger.info(f"Got past repo {repo_id}") -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def trim_commits_post_analysis_facade_task(repo_git): logger = logging.getLogger(trim_commits_post_analysis_facade_task.__name__) @@ -168,7 +168,7 @@ def facade_start_contrib_analysis_task(): facade_helper.update_status('Updating Contributors') facade_helper.log_activity('Info', 'Updating Contributors with commits') -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def facade_fetch_missing_commit_messages(repo_git): logger = logging.getLogger(facade_fetch_missing_commit_messages.__name__) facade_helper = FacadeHelper(logger) @@ -223,7 +223,7 @@ def facade_fetch_missing_commit_messages(repo_git): #enable celery multithreading -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: """Take a large list of commit data to analyze and store in the database. Meant to be run in parallel with other instances of this task. 
""" @@ -384,7 +384,7 @@ def clone_repos(): setattr(repoStatus,"facade_status", CollectionState.ERROR.value) session.commit() - clone_repos.si().apply_async(countdown=60*5) + clone_repos.si().apply_async(countdown=60*5) #@celery.task(bind=True) @@ -397,7 +397,7 @@ def clone_repos(): # facade_helper = FacadeHelper(logger) # check_for_repo_updates(session, repo_git) -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) +@celery.task(base=FacadeRepoCollectionTask, bind=True) def git_update_commit_count_weight(self, repo_git): engine = self.app.engine @@ -412,7 +412,7 @@ def git_update_commit_count_weight(self, repo_git): update_facade_scheduling_fields(repo_git, facade_weight, commit_count) -@celery.task(base=AugurFacadeRepoCollectionTask) +@celery.task(base=FacadeRepoCollectionTask) def git_repo_updates_facade_task(repo_git): logger = logging.getLogger(git_repo_updates_facade_task.__name__) diff --git a/augur/tasks/git/scc_value_tasks/__init__.py b/collectoss/tasks/git/scc_value_tasks/__init__.py similarity index 100% rename from augur/tasks/git/scc_value_tasks/__init__.py rename to collectoss/tasks/git/scc_value_tasks/__init__.py diff --git a/augur/tasks/git/scc_value_tasks/core.py b/collectoss/tasks/git/scc_value_tasks/core.py similarity index 85% rename from augur/tasks/git/scc_value_tasks/core.py rename to collectoss/tasks/git/scc_value_tasks/core.py index 65ff4cb12..7c9e0bafd 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/collectoss/tasks/git/scc_value_tasks/core.py @@ -1,9 +1,9 @@ from datetime import datetime import os -from augur.application.db.models import * -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value -from augur.tasks.util.worker_util import parse_json_from_subprocess_call -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from collectoss.application.db.models import * +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value +from collectoss.tasks.util.worker_util import parse_json_from_subprocess_call +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path def value_model(logger,repo_git): """Runs scc on repo and stores data in database diff --git a/collectoss/tasks/git/scc_value_tasks/tasks.py b/collectoss/tasks/git/scc_value_tasks/tasks.py new file mode 100644 index 000000000..77b94b679 --- /dev/null +++ b/collectoss/tasks/git/scc_value_tasks/tasks.py @@ -0,0 +1,13 @@ +import logging +from collectoss.application.db.lib import get_session +from collectoss.tasks.git.scc_value_tasks.core import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import FacadeRepoCollectionTask + + +@celery.task(base=FacadeRepoCollectionTask) +def process_scc_value_metrics(repo_git): + + logger = logging.getLogger(process_scc_value_metrics.__name__) + + value_model(logger,repo_git,) \ No newline at end of file diff --git a/augur/tasks/git/util/__init__.py b/collectoss/tasks/git/util/__init__.py similarity index 100% rename from augur/tasks/git/util/__init__.py rename to collectoss/tasks/git/util/__init__.py diff --git a/augur/tasks/git/util/facade_worker/__init__.py b/collectoss/tasks/git/util/facade_worker/__init__.py similarity index 100% rename from augur/tasks/git/util/facade_worker/__init__.py rename to collectoss/tasks/git/util/facade_worker/__init__.py diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/__init__.py 
b/collectoss/tasks/git/util/facade_worker/facade_worker/__init__.py new file mode 100644 index 000000000..4b1acc304 --- /dev/null +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/__init__.py @@ -0,0 +1,5 @@ +#SPDX-License-Identifier: MIT +"""collectoss_worker_github - CollectOSS Worker that collects GitHub data""" + +__version__ = '1.3.0' +__all__ = [] diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/collectoss/tasks/git/util/facade_worker/facade_worker/analyzecommit.py similarity index 98% rename from augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index 0426c8372..5da00e440 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -45,8 +45,8 @@ import sqlalchemy as s from typing import Optional, List, Tuple, Dict, Any -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text -from augur.tasks.init import get_rabbitmq_conn_string +from collectoss.application.db.lib import execute_sql, fetchall_data_from_sql_text +from collectoss.tasks.init import get_rabbitmq_conn_string def check_swapped_emails(name: str, email: str) -> Tuple[str, str]: if name.find('@') >= 0 and email.find('@') == -1: @@ -161,7 +161,7 @@ def analyze_commit( ) git_log = check_output( [f"git", "--git-dir", repo_loc, "log", "-p", "-M", commit, "-n1", - f"--pretty=format:'{pretty_format}'"] + f"--pretty=format:{pretty_format}"] ) except Exception as e: logger.error(f"Failed to run git log for commit {commit}: {e}") diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py similarity index 96% rename from augur/tasks/git/util/facade_worker/facade_worker/config.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/config.py index a83d09390..7da6495bd 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py @@ -35,9 +35,9 @@ from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig -from augur.application.db.lib import execute_sql +from collectoss.application.db.session import DatabaseSession +from collectoss.application.config import SystemConfig +from collectoss.application.db.lib import execute_sql from logging import Logger logger = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def get_database_args_from_env(): try: db_json_file_location = os.getcwd() + "/db.config.json" except FileNotFoundError: - logger.error("\n\nPlease run augur commands in the root directory\n\n") + logger.error("\n\nPlease run collectoss commands in the root directory\n\n") sys.exit() db_json_exists = os.path.exists(db_json_file_location) @@ -104,7 +104,7 @@ class FacadeHelper(): """ def __init__(self,logger: Logger): - from augur.application.db import get_engine + from collectoss.application.db import get_engine engine = get_engine() self.repos_processed = 0 # super().__init__(logger=logger, engine=engine) @@ -112,7 +112,7 @@ def __init__(self,logger: Logger): self.logger = logger with DatabaseSession(logger, engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) worker_options = config.get_section("Facade") @@ -131,7 +131,6 @@ def 
__init__(self,logger: Logger): self.rebuild_caches = worker_options["rebuild_caches"] self.multithreaded = worker_options["multithreaded"] self.create_xlsx_summary_files = worker_options["create_xlsx_summary_files"] - self.facade_contributor_full_recollect = worker_options["facade_contributor_full_recollect"] self.commit_messages = worker_options["commit_messages"] self.tool_source = "Facade" diff --git a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/.gitignore b/collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/.gitignore similarity index 100% rename from augur/tasks/git/util/facade_worker/facade_worker/excel_generators/.gitignore rename to collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/.gitignore diff --git a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/ADDING_NEW_REPORTS b/collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/ADDING_NEW_REPORTS similarity index 100% rename from augur/tasks/git/util/facade_worker/facade_worker/excel_generators/ADDING_NEW_REPORTS rename to collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/ADDING_NEW_REPORTS diff --git a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/__init__.py b/collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/__init__.py similarity index 100% rename from augur/tasks/git/util/facade_worker/facade_worker/excel_generators/__init__.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/__init__.py diff --git a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py b/collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py similarity index 100% rename from augur/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py diff --git a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/readme.md b/collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/readme.md similarity index 100% rename from augur/tasks/git/util/facade_worker/facade_worker/excel_generators/readme.md rename to collectoss/tasks/git/util/facade_worker/facade_worker/excel_generators/readme.md diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py b/collectoss/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py similarity index 93% rename from augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py index 1811c734f..845b87dff 100755 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py @@ -32,7 +32,7 @@ #from contributor_interfaceable.facade08contributorinterfaceable import ContributorInterfaceable -from augur.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * +from collectoss.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/collectoss/tasks/git/util/facade_worker/facade_worker/rebuildcache.py similarity index 99% rename from augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index d92f17b69..5d83201ad 100644 --- 
a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -26,7 +26,7 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. import sqlalchemy as s -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text +from collectoss.application.db.lib import execute_sql, fetchall_data_from_sql_text from .utilitymethods import store_working_author, trim_author # if platform.python_implementation() == 'PyPy': # import pymysql diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py similarity index 98% rename from augur/tasks/git/util/facade_worker/facade_worker/repofetch.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py index 6e911f6fd..658ddc1d0 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -32,10 +32,10 @@ import sqlalchemy as s from .utilitymethods import update_repo_log, get_absolute_repo_path from sqlalchemy.orm.exc import NoResultFound -from augur.application.db.models.augur_data import * -from augur.application.db.models.augur_operations import CollectionStatus -from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list -from augur.application.db.lib import execute_sql, get_repo_by_repo_git +from collectoss.application.db.models.augur_data import * +from collectoss.application.db.models.augur_operations import CollectionStatus +from collectoss.application.db.util import execute_session_query, convert_orm_list_to_dict_list +from collectoss.application.db.lib import execute_sql, get_repo_by_repo_git class GitCloneError(Exception): pass diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py similarity index 93% rename from augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py rename to collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 92546002a..513390d07 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -29,12 +29,12 @@ from subprocess import check_output, CalledProcessError import os import sqlalchemy as s -from augur.application.db.models import * +from collectoss.application.db.models import * from .config import FacadeHelper as FacadeHelper -from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session -from augur.application.db.util import execute_session_query -#from augur.tasks.git.util.facade_worker.facade +from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps +from collectoss.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session +from collectoss.application.db.util import execute_session_query +#from collectoss.tasks.git.util.facade_worker.facade def update_repo_log(logger, facade_helper, repos_id,status): diff --git 
a/collectoss/tasks/github/__init__.py b/collectoss/tasks/github/__init__.py new file mode 100644 index 000000000..de3f37bd8 --- /dev/null +++ b/collectoss/tasks/github/__init__.py @@ -0,0 +1,7 @@ +from collectoss.tasks.github.contributors import * +from collectoss.tasks.github.events import * +from collectoss.tasks.github.issues import * +from collectoss.tasks.github.messages import * +from collectoss.tasks.github.pull_requests.tasks import * +from collectoss.tasks.github.repo_info.tasks import * +from collectoss.tasks.github.releases.tasks import * diff --git a/augur/tasks/github/contributors.py b/collectoss/tasks/github/contributors.py similarity index 60% rename from augur/tasks/github/contributors.py rename to collectoss/tasks/github/contributors.py index 20f796647..f3eaaa802 100644 --- a/augur/tasks/github/contributors.py +++ b/collectoss/tasks/github/contributors.py @@ -2,14 +2,17 @@ import logging import traceback -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.github.util.github_paginator import hit_api -from augur.tasks.github.facade_github.tasks import * -from augur.application.db.models import Contributor -from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts, get_session, batch_insert_contributors -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.tasks.github.util.github_paginator import hit_api +from collectoss.tasks.github.facade_github.tasks import * +from collectoss.application.db.models import Contributor +from collectoss.application.db.util import execute_session_query +from collectoss.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor + +from collectoss.application.db.lib import bulk_insert_dicts, get_session, batch_insert_contributors +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +import json @@ -18,7 +21,7 @@ def process_contributors(): logger = logging.getLogger(process_contributors.__name__) - tool_source = "Contributors task" + tool_source = "Process Contributors task" tool_version = "2.0" data_source = "Github API" @@ -26,7 +29,15 @@ def process_contributors(): with get_session() as session: - query = session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) + query = ( + session.query(Contributor) + .filter( + Contributor.data_source == data_source, + Contributor.cntrb_created_at.is_(None), + Contributor.cntrb_last_used.is_(None) + ) + .limit(500) + ) contributors = execute_session_query(query, 'all') contributors_len = len(contributors) @@ -47,16 +58,20 @@ def process_contributors(): url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" - data = retrieve_dict_data(url, key_auth, logger) + try: + data = retrieve_dict_data(url, key_auth, logger) + except json.JSONDecodeError as e: + logger.error(f"Encountered error parsing JSON in call to {url}") + logger.info(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") + continue if data is None: - print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") + logger.info(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") continue - 
new_contributor_data = { - "cntrb_created_at": data["created_at"], - "cntrb_last_used": data["updated_at"] - } + + new_contributor_data = extract_github_contributor(data, tool_source, tool_version, data_source) + contributor_dict.update(new_contributor_data) @@ -106,16 +121,20 @@ def retrieve_dict_data(url: str, key_auth, logger): return None -@celery.task(base=AugurCoreRepoCollectionTask, bind=True) +@celery.task(base=CoreRepoCollectionTask, bind=True) def grab_comitters(self, repo_git,platform="github"): + tool_source = "Committers task" + tool_version = "1.0" + data_source = "Github API" + engine = self.app.engine logger = logging.getLogger(grab_comitters.__name__) try: key_auth = GithubRandomKeyAuth(logger) - grab_committer_list(logger, key_auth, repo_git, platform) + grab_committer_list(logger, key_auth, repo_git, tool_source, tool_version, data_source, platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git a/augur/tasks/github/detect_move/__init__.py b/collectoss/tasks/github/detect_move/__init__.py similarity index 100% rename from augur/tasks/github/detect_move/__init__.py rename to collectoss/tasks/github/detect_move/__init__.py diff --git a/augur/tasks/github/detect_move/core.py b/collectoss/tasks/github/detect_move/core.py similarity index 81% rename from augur/tasks/github/detect_move/core.py rename to collectoss/tasks/github/detect_move/core.py index a3eb3803d..1c0d7dba8 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/collectoss/tasks/github/detect_move/core.py @@ -1,13 +1,13 @@ -from augur.tasks.github.util.github_task_session import * -from augur.application.db.models import Repo, CollectionStatus -from augur.tasks.github.util.github_paginator import hit_api -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.github.util.util import parse_json_response +from collectoss.tasks.github.util.github_task_session import * +from collectoss.application.db.models import Repo, CollectionStatus +from collectoss.tasks.github.util.github_paginator import hit_api +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.tasks.github.util.util import parse_json_response from datetime import datetime -from augur.tasks.util.collection_state import CollectionState -from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts -from augur.application.db.models import HistoricalRepoURLs +from collectoss.tasks.util.collection_state import CollectionState +from collectoss.application.db.util import execute_session_query +from collectoss.application.db.lib import bulk_insert_dicts +from collectoss.application.db.models import HistoricalRepoURLs from sqlalchemy.exc import IntegrityError @@ -59,7 +59,14 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger): response_from_gh = hit_api(key_auth, url, logger) page_data = parse_json_response(logger, response_from_gh) - + # Note: if parse_json_response fails, it will return a string, causing `in` to do a substring check. + # This may lead to a crash if the field is present but the parsing fails. 
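A standalone illustration of the pitfall that note describes (the `page_data` value here is hypothetical): when the parser falls back to returning raw text, the `in` test silently becomes a substring search, and the later indexing raises a TypeError. The status-code/'full_name' guard added below avoids exactly this.

page_data = '{"full_name": "chaoss/collectoss"}'  # parser fell back to raw text, not a dict
if 'full_name' in page_data:  # True, but as a substring test on a str, not a key lookup
    try:
        print(page_data['full_name'])
    except TypeError as e:
        print(f"crashes as the note warns: {e}")  # string indices must be integers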
+ if response_from_gh.status_code != 200 or 'full_name' not in page_data: + logger.error( + f"Unexpected response fetching redirect target: status={response_from_gh.status_code}, " + f"url={url}, body={response_from_gh.text}" + ) + raise Exception("Could not resolve owner/repo from redirect target due to missing data in the github response") full_repo_name = page_data['full_name'] splits = full_repo_name.split('/') diff --git a/augur/tasks/github/detect_move/tasks.py b/collectoss/tasks/github/detect_move/tasks.py similarity index 72% rename from augur/tasks/github/detect_move/tasks.py rename to collectoss/tasks/github/detect_move/tasks.py index 249ff1a0d..f586bc54f 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/collectoss/tasks/github/detect_move/tasks.py @@ -1,15 +1,15 @@ import logging -from augur.tasks.github.detect_move.core import ping_github_for_repo_move, RepoMovedException, RepoGoneException -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git, get_session -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.github.detect_move.core import ping_github_for_repo_move, RepoMovedException, RepoGoneException +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask, SecondaryRepoCollectionTask +from collectoss.application.db.lib import get_repo_by_repo_git, get_session +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from celery.exceptions import Retry, Reject -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def detect_github_repo_move_core(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_core.__name__) @@ -37,7 +37,7 @@ def detect_github_repo_move_core(repo_git : str) -> None: raise Reject(e) -@celery.task(base=AugurSecondaryRepoCollectionTask) +@celery.task(base=SecondaryRepoCollectionTask) def detect_github_repo_move_secondary(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_secondary.__name__) diff --git a/augur/tasks/github/events.py b/collectoss/tasks/github/events.py similarity index 91% rename from augur/tasks/github/events.py rename to collectoss/tasks/github/events.py index 38a5e9e9c..24b1e42ff 100644 --- a/augur/tasks/github/events.py +++ b/collectoss/tasks/github/events.py @@ -5,21 +5,22 @@ from abc import ABC, abstractmethod from datetime import datetime, timedelta, timezone -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor, Repo -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id, get_session, get_engine, get_core_data_last_collected, 
batch_insert_contributors +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.application.db.data_parse import * +from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.github.util.github_task_session import GithubTaskManifest +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.application.db.models import PullRequestEvent, IssueEvent, Contributor, Repo +from collectoss.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id, get_session, get_engine, get_core_data_last_collected, batch_insert_contributors, get_batch_size + platform_id = 1 -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_events(repo_git: str, full_collection: bool): logger = logging.getLogger(collect_events.__name__) @@ -115,12 +116,14 @@ def collect(self, repo_git, key_auth, since): owner, repo = get_owner_repo(repo_git) self.repo_identifier = f"{owner}/{repo}" + event_batch_size = get_batch_size("event") + events = [] for event in self._collect_events(repo_git, key_auth, since): events.append(event) # making this a decent size since process_events retrieves all the issues and prs each time - if len(events) >= 500: + if len(events) >= event_batch_size: self._process_events(events, repo_id) events.clear() @@ -277,6 +280,8 @@ def collect(self, repo_git, key_auth, since): def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, since): + event_batch_size = get_batch_size("event") + engine = get_engine() with engine.connect() as connection: @@ -327,19 +332,21 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc except UrlNotFoundException as e: self._logger.info(f"{self.repo_identifier}: Issue with number of {issue_number} returned 404 on event data. Skipping.") - if len(events) > 500: + if len(events) >= event_batch_size: self._insert_contributors(contributors) self._insert_issue_events(events) events.clear() - + if events: self._insert_contributors(contributors) self._insert_issue_events(events) events.clear() - + def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since): + event_batch_size = get_batch_size("event") + engine = get_engine() with engine.connect() as connection: @@ -389,7 +396,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since): self._logger.info(f"{self.repo_identifier}: PR with number of {pr_number} returned 404 on event data. 
Skipping.") continue - if len(events) > 500: + if len(events) >= event_batch_size: self._insert_contributors(contributors) self._insert_pr_events(events) events.clear() diff --git a/augur/tasks/github/facade_github/__init__.py b/collectoss/tasks/github/facade_github/__init__.py similarity index 100% rename from augur/tasks/github/facade_github/__init__.py rename to collectoss/tasks/github/facade_github/__init__.py diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/__init__.py b/collectoss/tasks/github/facade_github/contributor_interfaceable/__init__.py similarity index 100% rename from augur/tasks/github/facade_github/contributor_interfaceable/__init__.py rename to collectoss/tasks/github/facade_github/contributor_interfaceable/__init__.py diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py similarity index 72% rename from augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py rename to collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 33f5ca83d..1e064f033 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -1,13 +1,13 @@ -from augur.tasks.github.util.github_task_session import * +from collectoss.tasks.github.util.github_task_session import * import json import time import sqlalchemy as s -from augur.application.db.models import * -from augur.tasks.github.util.github_paginator import hit_api, process_dict_response -from augur.tasks.github.util.github_data_access import GithubDataAccess +from collectoss.application.db.models import * +from collectoss.tasks.github.util.github_paginator import hit_api, process_dict_response +from collectoss.tasks.github.util.github_data_access import GithubDataAccess # Debugger -from augur.tasks.github.util.github_paginator import GithubApiResult -from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id +from collectoss.tasks.github.util.github_paginator import GithubApiResult +from collectoss.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -16,7 +16,7 @@ #'sqla+postgresql://scott:tiger@localhost/mydatabase' """ -A few interesting ideas: Maybe get the top committers from each repo first? curl https://api.github.com/repos/chaoss/augur/contributors +A few interesting ideas: Maybe get the top committers from each repo first? curl https://api.github.com/repos/chaoss/collectoss/contributors """ @@ -27,13 +27,18 @@ def clean_dict(d): return {k: ("" if v is None else v) for k, v in d.items()} -# Hit the endpoint specified by the url and return the json that it returns if it returns a dict. -# Returns None on failure. -# NOTE: This function is being deprecated in favor of GithubDataAcess.get_resource() -# No functional change in this function body; logic preserved fully. -# This function still attempts to hit a GitHub API endpoint and return a dictionary if successful. 
-
+# deprecated in favor of GithubDataAccess.get_resource()
 def request_dict_from_endpoint(logger, session, url, timeout_wait=10):
+    """Hit the endpoint specified by the url and return the json that it returns if it returns a dict.
+
+    NOTE: This function is being deprecated in favor of GithubDataAccess.get_resource()
+    No functional change in this function body; logic preserved fully.
+    This function still attempts to hit a GitHub API endpoint and return a dictionary if successful.
+
+
+    Returns:
+        Dict on success, None on failure
+    """
     attempts = 0
     response_data = None
     success = False
@@ -117,7 +122,7 @@ def create_endpoint_from_commit_sha(logger, commit_sha, repo_id):
     logger.debug(
         f"Trying to create endpoint from commit hash: {commit_sha}")
-    # https://api.github.com/repos/chaoss/augur/commits/53b0cc122ac9ecc1588d76759dc2e8e437f45b48
+    # https://api.github.com/repos/chaoss/collectoss/commits/5a1f7da239555a102fc18065720ebad516691bbc

     #stmnt = s.select(Repo.repo_path, Repo.repo_name).where(Repo.repo_id == repo_id)

@@ -138,28 +143,17 @@ def create_endpoint_from_commit_sha(logger, commit_sha, repo_id):

     return url

-# Try to construct the best url to ping GitHub's API for a username given a full name.
-def create_endpoint_from_name(contributor):
-    # Try to get the 'names' field if 'commit_name' field is not present in contributor data.
-    name_field = 'cmt_author_name' if 'commit_name' in contributor else 'name'
-
-    # Deal with case where name is one word or none.
-    if len(contributor[name_field].split()) < 2:
-        raise ValueError
-    cmt_cntrb = {
-        'fname': contributor[name_field].split()[0],
-        # Pythonic way to get the end of a list so that we truely get the last name.
-        'lname': contributor[name_field].split()[-1]
-    }
-    url = 'https://api.github.com/search/users?q=fullname:{}+{}'.format(
-        cmt_cntrb['fname'], cmt_cntrb['lname'])
-
-    return url
-
 def insert_alias(logger, contributor, email):
-    # Insert cntrb_id and email of the corresponding record into the alias table
-    # Another database call to get the contributor id is needed because it's an autokeyincrement accessed by multiple workers
-    # Same principle as enrich_cntrb_id method.
+    """Insert cntrb_id and email of the corresponding record into the alias table.
+    Another database call to get the contributor id is needed because it's an
+    auto-incremented key accessed by multiple workers. Same principle as enrich_cntrb_id method.
+    Args:
+        logger: Logger instance.
+        contributor (dict): Contributor record containing 'gh_user_id' and optionally 'cntrb_canonical'.
+        email (str): Email address to create the alias for.
+    Raises:
+        LookupError: If the contributor cannot be found in the database by gh_user_id.
+    """

     contributor_table_data = get_contributors_by_github_user_id(contributor["gh_user_id"])

@@ -168,7 +162,7 @@ def insert_alias(logger, contributor, email):
         logger.debug(
             f"cntrb_id {contributor_table_data[0].cntrb_id} found in database and assigned to enriched data")
     elif len(contributor_table_data) == 0:
-        logger.error("Couldn't find contributor in database. Something has gone very wrong.
CollectOSS ran into a contributor whose login can be found in the contributor's table, but cannot be retrieved via the user_id that was gotten using the same login.")
         raise LookupError
     else:
         logger.warning(
@@ -192,14 +186,17 @@ def insert_alias(logger, contributor, email):
     alias_clean = clean_dict(alias)

     # Insert new alias
-    bulk_insert_dicts(logger, alias_clean, ContributorsAlias, ['alias_email'])
+    bulk_insert_dicts(logger, alias_clean, ContributorsAlias, ['cntrb_id','alias_email'])

     return

-# Takes the user data from the endpoint as arg
-# Updates the alias table if the login is already in the contributor's table with the new email.
-# Returns whether the login was found in the contributors table
-def resolve_if_login_existing(logger, contributor):
+def resolve_if_login_existing(logger, contributor) -> bool:
+    """Takes the user data from the endpoint as arg.
+    Updates the alias table if the login is already in the contributor's table with the new email.
+
+    Returns:
+        bool: Whether the login was found in the contributors table.
+    """

     # check if login exists in contributors table
     select_cntrbs_query = s.sql.text("""
         SELECT cntrb_id from contributors
@@ -262,13 +259,23 @@ def update_contributor(self, cntrb, max_attempts=3):
                 attempts += 1
 """

-# Try every distinct email found within a commit for possible username resolution.
-# Add email to garbage table if can't be resolved.
-# \param contributor is the raw database entry
-# \return A dictionary of response data from github with potential logins on success.
-# None on failure
-def fetch_username_from_email(logger, auth, commit):
+
+def fetch_username_from_email(logger, auth, commit) -> dict | None:
+    """Try every distinct email found within a commit for possible username resolution.
+    Add email to garbage table if can't be resolved.
+
+    Args:
+        logger (Logger): logging instance to use
+        auth (GithubRandomKeyAuth): key manager instance to use for auth
+        commit (dict): the raw database entry for the commit whose emails are tried
+
+    Returns:
+        dict: A dictionary of response data from github with potential logins on success;
+        None on failure
+    """

     # Default to failed state
     login_json = None

@@ -306,47 +313,29 @@ def fetch_username_from_email(logger, auth, commit):

     return login_json

-# Method to return the login given commit data using the supplemental data in the commit
-# -email
-# -name
+
 def get_login_with_supplemental_data(logger, auth, commit_data):
+    """Return the login given commit data using the supplemental data in the commit (email).
+
+    Args:
+        logger: Logger instance.
+        auth: GithubRandomKeyAuth instance for API authentication.
+        commit_data (dict): Commit record containing 'email_raw'.
+
+    Returns:
+        str: GitHub login username on success. None if unresolved.
+    """

     # Try to get login from all possible emails
     # Is None upon failure.
     login_json = fetch_username_from_email(logger,auth,commit_data)

-    # Check if the email result got anything, if it failed, place in unresolved and try a name search.
-    if login_json is None or 'total_count' not in login_json or login_json['total_count'] == 0:
-
-        unresolved = {
-            "email": commit_data['email_raw'],
-            "name": commit_data['name'],
-        }
-        logger.debug(f"Inserting data to unresolved: {unresolved}")
-
-        try:
-
-            unresolved_natural_keys = ['email']
-            bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys)
-        except Exception as e:
-            logger.error(
-                f"Could not create new unresolved email {unresolved['email']}.
Error: {e}") - - logger.warning( - "Could not resolve the username from the email. Trying a name only search...") - try: - url = create_endpoint_from_name(commit_data) - except Exception as e: - logger.warning( - f"Couldn't resolve name url with given data. Reason: {e}") - return None - - login_json = GithubDataAccess(auth, logger, "search").get_resource(url) - # total_count is the count of username's found by the endpoint. + # This Checks if the email result got anything. + # If it fails, dont place it in unresolved yet as we may add more steps later if login_json is None or 'total_count' not in login_json: logger.error( - "Search query returned an empty response, moving on...\n") + "Search query returned an empty response. Could not resolve the username from the email. Moving on...\n") return None if login_json['total_count'] == 0: logger.error( diff --git a/collectoss/tasks/github/facade_github/core.py b/collectoss/tasks/github/facade_github/core.py new file mode 100644 index 000000000..64b42e0d2 --- /dev/null +++ b/collectoss/tasks/github/facade_github/core.py @@ -0,0 +1,83 @@ +from collectoss.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.tasks.github.util.github_task_session import * +from collectoss.application.db.models import * +from collectoss.tasks.util.ContributorUUID import GithubUUID +from collectoss.application.db.lib import bulk_insert_dicts, batch_insert_contributors +from collectoss.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor +from collectoss.tasks.github.util.github_data_access import GithubDataAccess + + + + +def query_github_contributors(logger, key_auth, github_url, tool_source:str, tool_version:str, data_source:str): + + """ Data collection function + Query the GitHub API for contributors + """ + + # Set platform id to 1 since it is a github method + platform_id = 1 + + # Extract owner/repo from the url for the endpoint + try: + owner, name = get_owner_repo(github_url) + except IndexError as e: + logger.error(f"Encountered bad url: {github_url}") + raise e + + # Set the base of the url and place to hold contributors to insert + contributors_url = ( + f"https://api.github.com/repos/{owner}/{name}/" + + "contributors?state=all" + ) + + # Get contributors that we already have stored + # Set our duplicate and update column map keys (something other than PK) to + # check dupicates/needed column updates with + table = 'contributors' + table_pkey = 'cntrb_id' + update_col_map = {'cntrb_email': 'email'} + duplicate_col_map = {'cntrb_login': 'login'} + + github_data_access = GithubDataAccess(key_auth, logger) + + contributor_count = github_data_access.get_resource_count(contributors_url) + + logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n") + + if contributor_count == 0: + return + + for repo_contributor in github_data_access.paginate_resource(contributors_url): + try: + # Need to hit this single contributor endpoint to get extra data including... + # `created at` + # i think that's it + cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) + + + logger.info("Hitting endpoint: " + cntrb_url + " ...\n") + #r = hit_api(session.oauths, cntrb_url, logger) + #contributor = r.json() + + contributor = github_data_access.get_resource(cntrb_url) + + cntrb = extract_github_contributor(contributor, tool_source, tool_version, data_source) + + #insert cntrb to table. 
+            #session.logger.info(f"Contributor: {cntrb} \n")
+            batch_insert_contributors(logger, [cntrb])
+
+        except Exception as e:
+            logger.error("Caught exception: {}".format(e))
+            logger.error("Cascading contributor anomaly from missing repo contributor data: {} ...\n".format(cntrb_url))
+            raise e
+
+# Get all the committer data for a repo.
+# Used by facade in analyzecommit
+def grab_committer_list(logger, key_auth, repo_git, tool_source: str, tool_version: str, data_source: str, platform="github" ):
+
+    # Create API endpoint from repo_id
+    query_github_contributors(logger, key_auth, repo_git, tool_source, tool_version, data_source)
+
\ No newline at end of file
diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py
new file mode 100644
index 000000000..ab7a18eab
--- /dev/null
+++ b/collectoss/tasks/github/facade_github/tasks.py
@@ -0,0 +1,311 @@
+import logging
+
+
+from collectoss.tasks.init.celery_app import celery_app as celery
+from collectoss.tasks.init.celery_app import FacadeRepoCollectionTask
+from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
+from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
+from collectoss.tasks.github.facade_github.core import *
+from collectoss.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_email, get_repo_by_repo_git, batch_insert_contributors, get_batch_size
+from collectoss.application.db.lib import get_session, execute_session_query
+from collectoss.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import *
+from collectoss.application.db.lib import bulk_insert_dicts
+from collectoss.application.db.data_parse import extract_needed_contributor_data as extract_github_contributor
+
+
+
+
+def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id, tool_source:str, tool_version:str, data_source:str):
+
+    github_data_access = GithubDataAccess(auth, logger)
+
+    for contributor in contributorQueue:
+        # Get the email from the commit data
+        email = contributor['email_raw'] if 'email_raw' in contributor else contributor['email']
+
+        name = contributor['name']
+
+        # check the email to see if it already exists in contributor_aliases
+
+        # Look up email to see if resolved
+        alias_table_data = get_contributor_aliases_by_email(email)
+        if len(alias_table_data) >= 1:
+            # Move on if email resolved
+            logger.debug(
+                f"Email {email} has been resolved earlier.")
+
+            continue
+
+        #Check the unresolved_commits table to avoid hitting endpoints that we know don't have relevant data needlessly
+
+
+        unresolved_query_result = get_unresolved_commit_emails_by_email(email)
+
+        if len(unresolved_query_result) >= 1:
+
+            logger.debug(f"Commit data with email {email} has been unresolved in the past, skipping...")
+            continue
+
+        login = None
+
+        #Check the contributors table for a login for the given name
+        # This is being removed because anyone with a common name (e.g. dave, adam) who only puts
+        # their first name or nickname on their profile is getting grouped with EVERYONE else who is doing that.
+        # AE
+
+        # contributors_with_matching_name = TODO
+
+        # if not contributors_with_matching_name or len(contributors_with_matching_name) > 1:
+        #     logger.debug("Failed local login lookup")
+        # else:
+        #     login = contributors_with_matching_name[0].gh_login
+
+
+        # Try to get the login from the commit sha
+        if login is None or login == "":
+            login = get_login_with_commit_hash(logger, auth, contributor, repo_id)
+
+        if login is None or login == "":
+            logger.warning("Failed to get login from commit hash")
+            # Try to get the login from supplemental data if not found with the commit hash
+            login = get_login_with_supplemental_data(logger, auth,contributor)
+
+        if login is None or login == "":
+            logger.error("Failed to get login from supplemental data!")
+
+            unresolved = {
+                "email": email,
+                "name": name,
+            }
+            logger.debug(f"No more username resolution methods available. Inserting data into unresolved table: {unresolved}")
+
+            try:
+                unresolved_natural_keys = ['email']
+                bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys)
+            except Exception as e:
+                logger.error(
+                    f"Could not create new unresolved email {email}. Error: {e}")
+            # move on to the next contributor
+            continue
+
+        url = ("https://api.github.com/users/" + login)
+
+        try:
+            user_data = github_data_access.get_resource(url)
+        except UrlNotFoundException as e:
+            logger.warning(f"User {login} not found on github. Skipping...")
+            continue
+
+        # Use the email found in the commit data if api data is NULL
+        emailFromCommitData = contributor['email_raw'] if 'email_raw' in contributor else contributor['email']
+
+
+        # Get name from commit if not found by GitHub
+        name_field = contributor['commit_name'] if 'commit_name' in contributor else contributor['name']
+
+        cntrb = extract_github_contributor(user_data, tool_source, tool_version, data_source)
+        if cntrb is None:
+            continue
+
+        # extra processing unique to facade based contributor collection
+        if not cntrb.get('cntrb_canonical'):
+            cntrb['cntrb_canonical'] = emailFromCommitData
+        if not cntrb.get('cntrb_email'):
+            cntrb['cntrb_email'] = emailFromCommitData
+
+        if not cntrb.get('cntrb_full_name'):
+            cntrb['cntrb_full_name'] = name_field
+
+
+        #Executes an upsert with sqlalchemy
+        cntrb_natural_keys = ['cntrb_id']
+        batch_insert_contributors(logger, [cntrb])
+
+        try:
+            # Update alias after insertion. Insertion needs to happen first so we can get the auto-incremented key
+            insert_alias(logger, cntrb, emailFromCommitData)
+        except LookupError as e:
+            logger.error(
+                ''.join(traceback.format_exception(None, e, e.__traceback__)))
+            logger.error(
+                f"Contributor id not able to be found in database despite the user_id existing. Something very wrong is happening. Error: {e}")
+            return
+
+
+        #Replace each instance of a single or double quote with escape characters
+        #for postgres
+        escapedEmail = email.replace('"',r'\"')
+        escapedEmail = escapedEmail.replace("'",r'\'')
+        # Resolve any unresolved emails if we get to this point.
+        # They will get added to the alias table later
+        # Do this last to absolutely make sure that the email was resolved before we remove it from the unresolved table.
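The manual quote-escaping above can be avoided entirely with bound parameters, the same style link_commits_to_contributor below already uses; a hedged sketch of the DELETE that follows, written in that form (execute_sql is the helper this module imports):

import sqlalchemy as s

# With bound parameters the driver handles quoting, so no escapedEmail is needed
delete_unresolved = s.sql.text("""
    DELETE FROM unresolved_commit_emails
    WHERE email = :email
""").bindparams(email=email)
execute_sql(delete_unresolved)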
+ query = s.sql.text(""" + DELETE FROM unresolved_commit_emails + WHERE email='{}' + """.format(escapedEmail)) + + logger.debug(f"Updating now resolved email {email}") + + try: + execute_sql(query) + except Exception as e: + logger.error( + f"Deleting now resolved email failed with error: {e}") + raise e + + + return + + +def link_commits_to_contributor(logger, facade_helper, contributorQueue): + + # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. + for cntrb in contributorQueue: + logger.debug( + f"These are the emails and cntrb_id's returned: {cntrb}") + + query = s.sql.text(""" + UPDATE commits + SET cmt_ght_author_id=:cntrb_id + WHERE + (cmt_author_raw_email=:cntrb_email + OR cmt_author_email=:cntrb_email) + AND cmt_ght_author_id is NULL + """).bindparams(cntrb_id=cntrb["cntrb_id"],cntrb_email=cntrb["email"]) + + #engine.execute(query, **data) + facade_helper.insert_or_update_data(query) + + + return + + +# Update the contributors table from the data facade has gathered. +@celery.task(base=FacadeRepoCollectionTask, bind=True) +def insert_facade_contributors(self, repo_git): + + tool_source = "Insert Contributors task" + tool_version = "2.0" + data_source = "Github API" + + # Set platform id to 1 since this task is github specific + platform_id = 1 + + logger = logging.getLogger(insert_facade_contributors.__name__) + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + facade_helper = FacadeHelper(logger) + + # Find commits not yet linked to a contributor (cmt_ght_author_id IS NULL), + # skipping emails already marked unresolvable. + + logger.info( + "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) + new_contrib_sql = s.sql.text(""" + SELECT DISTINCT + commits.cmt_author_name AS NAME, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw + FROM + augur_data.commits + WHERE + commits.repo_id = :repo_id AND + commits.cmt_ght_author_id IS NULL AND + commits.cmt_author_raw_email NOT IN ( + SELECT email FROM augur_data.unresolved_commit_emails + ) + """).bindparams(repo_id=repo_id) + + #Execute statement with session. + result = execute_sql(new_contrib_sql) + + # Fetch all results immediately to close the database cursor/connection + # This prevents holding the connection open during GitHub API calls + rows = result.mappings().fetchall() + + #print(new_contribs) + + #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ + # 'repo_id': repo_id}).to_json(orient="records")) + + + key_auth = GithubRandomKeyAuth(logger) + + facade_batch_size = get_batch_size() + + # Process results in batches to reduce memory usage + batch = [] + + for row in rows: + batch.append(dict(row)) + + if len(batch) >= facade_batch_size: + process_commit_metadata(logger, key_auth, batch, repo_id, platform_id, tool_source, tool_version, data_source) + batch.clear() + + # Process remaining items in batch + if batch: + process_commit_metadata(logger, key_auth, batch, repo_id, platform_id, tool_source, tool_version, data_source) + + logger.debug("DEBUG: Got through the new_contribs") + + # Match unlinked commits to contributors via email from any source (cntrb_email, canonical email, or alias). 
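The deduplicated CTE in the query below exists because one email can map to several contributor rows across the three UNIONed sources, and DISTINCT ON (email) keeps exactly one cntrb_id per email (with only email ordered, which row wins a tie is unspecified). A tiny standalone illustration of that one-row-per-email shape, with made-up values:

pairs = [("a@x.io", 1), ("a@x.io", 2), ("b@y.io", 3)]
first_per_email = {}
for email, cntrb_id in pairs:
    first_per_email.setdefault(email, cntrb_id)  # keep the first mapping seen per email
assert first_per_email == {"a@x.io": 1, "b@y.io": 3}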
+ resolve_email_to_cntrb_id_sql = s.sql.text(""" + WITH email_to_contributor AS ( + SELECT cntrb_email AS email, cntrb_id + FROM augur_data.contributors + WHERE cntrb_email IS NOT NULL + + UNION ALL + + SELECT cntrb_canonical AS email, cntrb_id + FROM augur_data.contributors + WHERE cntrb_canonical IS NOT NULL + + UNION ALL + + SELECT alias_email AS email, cntrb_id + FROM augur_data.contributors_aliases + WHERE alias_email IS NOT NULL + ), + deduplicated AS ( + SELECT DISTINCT ON (email) email, cntrb_id + FROM email_to_contributor + ORDER BY email + ) + SELECT + d.cntrb_id, + c.cmt_author_email AS email + FROM + augur_data.commits c + INNER JOIN + deduplicated d + ON c.cmt_author_email = d.email + WHERE + c.cmt_ght_author_id IS NULL AND + c.repo_id = :repo_id + """).bindparams(repo_id=repo_id) + + + result = execute_sql(resolve_email_to_cntrb_id_sql) + + # Fetch all results immediately to close the database cursor/connection + # This prevents holding the connection open during database UPDATE operations + rows = result.mappings().fetchall() + + # Process results in batches to reduce memory usage + batch = [] + + for row in rows: + batch.append(dict(row)) + + if len(batch) >= facade_batch_size: + link_commits_to_contributor(logger, facade_helper, batch) + batch.clear() + + # Process remaining items in batch + if batch: + link_commits_to_contributor(logger, facade_helper, batch) + + return + diff --git a/augur/tasks/github/issues.py b/collectoss/tasks/github/issues.py similarity index 90% rename from augur/tasks/github/issues.py rename to collectoss/tasks/github/issues.py index 91e56deaf..406718759 100644 --- a/augur/tasks/github/issues.py +++ b/collectoss/tasks/github/issues.py @@ -5,21 +5,22 @@ from sqlalchemy.exc import IntegrityError -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee -from augur.application.config import get_development_flag -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_core_data_last_collected, batch_insert_contributors +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.application.db.data_parse import * +from collectoss.tasks.github.util.github_data_access import GithubDataAccess +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.application.db.models import Issue, IssueLabel, IssueAssignee +from collectoss.application.config import get_development_flag +from collectoss.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_core_data_last_collected, batch_insert_contributors, get_batch_size + development = get_development_flag() -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_issues(repo_git: str, full_collection: bool) -> int: """ Collect all issues 
(excluding pull requests) for a repository. @@ -28,7 +29,7 @@ def collect_issues(repo_git: str, full_collection: bool) -> int: related labels, assignees, and contributors. Args: - repo_git: Full git URL (e.g., 'https://github.com/chaoss/augur') + repo_git: Full git URL (e.g., 'https://github.com/chaoss/collectoss') full_collection: True for all historical data, False for incremental (last collection - 2 days) Returns: @@ -53,15 +54,16 @@ def collect_issues(repo_git: str, full_collection: bool) -> int: try: issue_data_generator = retrieve_all_issue_data(repo_git, logger, key_auth, core_data_last_collected) + issue_batch_size = get_batch_size() + # Process issues in batches to avoid memory spikes batch = [] total_issues = 0 - batch_size = 1000 for issue in issue_data_generator: batch.append(issue) - if len(batch) >= batch_size: + if len(batch) >= issue_batch_size: logger.info(f"{owner}/{repo}: Processing batch of {len(batch)} issues (total so far: {total_issues + len(batch)})") process_issues(batch, f"{owner}/{repo}: Issue task", repo_id, logger) total_issues += len(batch) diff --git a/augur/tasks/github/messages.py b/collectoss/tasks/github/messages.py similarity index 76% rename from augur/tasks/github/messages.py rename to collectoss/tasks/github/messages.py index 812af0fad..342eeb2ca 100644 --- a/augur/tasks/github/messages.py +++ b/collectoss/tasks/github/messages.py @@ -1,30 +1,32 @@ import logging from datetime import timedelta, timezone -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus -from augur.application.db import get_engine, get_session -from augur.application.db.lib import get_core_data_last_collected +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.application.db.data_parse import * +from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from collectoss.tasks.github.util.github_task_session import GithubTaskManifest +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus +from collectoss.application.db import get_engine, get_session +from collectoss.application.db.lib import get_core_data_last_collected, get_batch_size from sqlalchemy.sql import text + + platform_id = 1 -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_github_messages(repo_git: str, full_collection: bool) -> None: logger = logging.getLogger(collect_github_messages.__name__) with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + db_session = manifest.db_session - repo_id = augur_db.session.query(Repo).filter( + repo_id = db_session.session.query(Repo).filter( Repo.repo_git == repo_git).one().repo_id owner, repo 
= get_owner_repo(repo_git) @@ -41,13 +43,13 @@ def collect_github_messages(repo_git: str, full_collection: bool) -> None: message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name, core_data_last_collected) if message_data: - process_messages(message_data, task_name, repo_id, logger, augur_db) + process_messages(message_data, task_name, repo_id, logger, db_session) else: logger.info(f"{owner}/{repo} has no messages") else: - process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db, core_data_last_collected) + process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, db_session, core_data_last_collected) def is_repo_small(repo_id): @@ -80,7 +82,9 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas return list(github_data_access.paginate_resource(url)) -def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db, since) -> None: +def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, db_session, since) -> None: + + message_batch_size = get_batch_size("message") owner, repo = get_owner_repo(repo_git) @@ -93,21 +97,21 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger if since: query = text(f""" - (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_updated_at > timestamptz(timestamp '{since}') order by pr_created_at desc) + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_comments_url IS NOT NULL AND pr_updated_at > timestamptz(timestamp '{since}') order by pr_created_at desc) UNION - (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND updated_at > timestamptz(timestamp '{since}') order by created_at desc); + (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND comments_url IS NOT NULL AND updated_at > timestamptz(timestamp '{since}') order by created_at desc); """) else: query = text(f""" - (select pr_comments_url from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} AND pr_comments_url IS NOT NULL order by pr_created_at desc) UNION - (select comments_url as comment_url from issues WHERE repo_id={repo_id} order by created_at desc); + (select comments_url as comment_url from issues WHERE repo_id={repo_id} AND comments_url IS NOT NULL order by created_at desc); """) result = connection.execute(query).fetchall() - comment_urls = [x[0] for x in result] + comment_urls = [x[0] for x in result if x[0] is not None] github_data_access = GithubDataAccess(key_auth, logger) @@ -123,20 +127,19 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger except UrlNotFoundException: logger.info(f"{task_name}: PR or issue comment url of {comment_url} returned 404. Skipping.") skipped_urls += 1 - - if len(all_data) >= 20: - process_messages(all_data, task_name, repo_id, logger, augur_db) + + if len(all_data) >= message_batch_size: + process_messages(all_data, task_name, repo_id, logger, db_session) all_data.clear() if len(all_data) > 0: - process_messages(all_data, task_name, repo_id, logger, augur_db) + process_messages(all_data, task_name, repo_id, logger, db_session) logger.info(f"{task_name}: Finished. 
Skipped {skipped_urls} comment URLs due to 404.") -def process_messages(messages, task_name, repo_id, logger, augur_db): +def process_messages(messages, task_name, repo_id, logger, db_session): - tool_source = "Pr comment task" tool_version = "2.0" data_source = "Github API" @@ -153,13 +156,13 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): # create mapping from issue url to issue id of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = db_session.session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_url_to_id_map[issue.issue_url] = issue.issue_id # create mapping from pr url to pr id of current pull requests pr_issue_url_to_id_map = {} - prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + prs = db_session.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for pr in prs: pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id @@ -175,6 +178,12 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): related_pr_or_issue_found = False + # determine whether this is an issue or PR message so we can set the correct tool_source in metadata + if is_issue_message(message["html_url"]): + tool_source = "Issue comment task" + else: + tool_source = "Pr comment task" + # this adds the cntrb_id to the message data # the returned contributor will be added to the contributors list later, if the related issue or pr are found # this logic is used so we don't insert a contributor when the related message isn't inserted @@ -230,13 +239,13 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + db_session.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = db_session.insert_data(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -245,12 +254,12 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): issue_message_ref_dicts = [] for data in message_return_data: - augur_msg_id = data["msg_id"] + msg_id = data["msg_id"] platform_message_id = data["platform_msg_id"] ref = message_ref_mapping_data[platform_message_id] message_ref_data = ref["msg_ref_data"] - message_ref_data["msg_id"] = augur_msg_id + message_ref_data["msg_id"] = msg_id if ref["is_issue"] is True: issue_message_ref_dicts.append(message_ref_data) @@ -259,11 +268,11 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(pr_message_ref_dicts)} pr messages ref rows") pr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - augur_db.insert_data(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) + db_session.insert_data(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} issue messages 
ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + db_session.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) logger.info(f"{task_name}: Inserted {len(message_dicts)} messages. {len(issue_message_ref_dicts)} from issues and {len(pr_message_ref_dicts)} from prs") diff --git a/augur/tasks/github/pull_requests/__init__.py b/collectoss/tasks/github/pull_requests/__init__.py similarity index 100% rename from augur/tasks/github/pull_requests/__init__.py rename to collectoss/tasks/github/pull_requests/__init__.py diff --git a/augur/tasks/github/pull_requests/commits_model/__init__.py b/collectoss/tasks/github/pull_requests/commits_model/__init__.py similarity index 100% rename from augur/tasks/github/pull_requests/commits_model/__init__.py rename to collectoss/tasks/github/pull_requests/commits_model/__init__.py diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/collectoss/tasks/github/pull_requests/commits_model/core.py similarity index 72% rename from augur/tasks/github/pull_requests/commits_model/core.py rename to collectoss/tasks/github/pull_requests/commits_model/core.py index 83b283bb6..9abadc2dd 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/collectoss/tasks/github/pull_requests/commits_model/core.py @@ -1,13 +1,17 @@ import sqlalchemy as s -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException ## URLNotFoundException added to deal with percolation of 404 errors when the commits are not anywhere for a PR already captured. -from augur.application.db.models import * -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs +from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException ## URLNotFoundException added to deal with percolation of 404 errors when the commits are not anywhere for a PR already captured. 
+from collectoss.application.db.models import * +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.application.db.util import execute_session_query +from collectoss.application.db.lib import get_secondary_data_last_collected, get_updated_prs, get_batch_size -def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collection=False): - + + +def pull_request_commits_model(repo_id,logger, db_session, key_auth, full_collection=False): + + pr_commit_batch_size = get_batch_size() + if full_collection: # query existing PRs and the respective url we will append the commits url to pr_url_sql = s.sql.text(""" @@ -18,7 +22,7 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti pr_urls = [] #pd.read_sql(pr_number_sql, self.db, params={}) - pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() + pr_urls = db_session.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() else: last_collected = get_secondary_data_last_collected(repo_id).date() @@ -33,7 +37,7 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti }) - query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) + query = db_session.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') owner, name = get_owner_repo(repo.repo_git) @@ -44,7 +48,6 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti github_data_access = GithubDataAccess(key_auth, logger) - BATCH_SIZE = 1000 pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] all_data = [] for index,pr_info in enumerate(pr_urls): @@ -73,9 +76,9 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti } all_data.append(pr_commit_row) - if len(all_data) >= BATCH_SIZE: + if len(all_data) >= pr_commit_batch_size: logger.info(f"{task_name}: Inserting {len(all_data)} rows") - augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) + db_session.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) all_data.clear() except UrlNotFoundException: logger.info(f"{task_name}: PR with url of {pr_info['pr_url']} returned 404 on commit data. 
Skipping.") @@ -83,7 +86,7 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") - augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) + db_session.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) diff --git a/collectoss/tasks/github/pull_requests/commits_model/tasks.py b/collectoss/tasks/github/pull_requests/commits_model/tasks.py new file mode 100644 index 000000000..ab96d3eb5 --- /dev/null +++ b/collectoss/tasks/github/pull_requests/commits_model/tasks.py @@ -0,0 +1,19 @@ +import logging +from collectoss.tasks.github.pull_requests.commits_model.core import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import SecondaryRepoCollectionTask +from collectoss.tasks.github.util.github_task_session import GithubTaskManifest +from collectoss.application.db.lib import get_repo_by_repo_git + + + +@celery.task(base=SecondaryRepoCollectionTask) +def process_pull_request_commits(repo_git: str, full_collection: bool) -> None: + + logger = logging.getLogger(process_pull_request_commits.__name__) + + repo = get_repo_by_repo_git(repo_git) + + with GithubTaskManifest(logger) as manifest: + + pull_request_commits_model(repo.repo_id, logger, manifest.db_session, manifest.key_auth, full_collection) diff --git a/augur/tasks/github/pull_requests/core.py b/collectoss/tasks/github/pull_requests/core.py similarity index 96% rename from augur/tasks/github/pull_requests/core.py rename to collectoss/tasks/github/pull_requests/core.py index dd63edab6..7ad2183e5 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/collectoss/tasks/github/pull_requests/core.py @@ -2,12 +2,12 @@ from typing import Dict, List, Tuple, Optional -from augur.application.db.data_parse import * -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import bulk_insert_dicts, batch_insert_contributors -from augur.tasks.github.util.util import add_key_value_pair_to_dicts -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, Contributor +from collectoss.application.db.data_parse import * +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db.lib import bulk_insert_dicts, batch_insert_contributors +from collectoss.tasks.github.util.util import add_key_value_pair_to_dicts +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.application.db.models import PullRequest, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, Contributor PLATFORM_ID = 1 diff --git a/augur/tasks/github/pull_requests/files_model/__init__.py b/collectoss/tasks/github/pull_requests/files_model/__init__.py similarity index 100% rename from augur/tasks/github/pull_requests/files_model/__init__.py rename to collectoss/tasks/github/pull_requests/files_model/__init__.py diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/collectoss/tasks/github/pull_requests/files_model/core.py similarity index 80% rename from augur/tasks/github/pull_requests/files_model/core.py rename to collectoss/tasks/github/pull_requests/files_model/core.py index 60222a3bc..d07d43246 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/collectoss/tasks/github/pull_requests/files_model/core.py @@ 
-1,13 +1,17 @@ import sqlalchemy as s -from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess, NotFoundException, InvalidDataException -from augur.application.db.models import * -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs +from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess, NotFoundException, InvalidDataException +from collectoss.application.db.models import * +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.application.db.util import execute_session_query +from collectoss.application.db.lib import get_secondary_data_last_collected, get_updated_prs, get_batch_size -def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection=False): - + + +def pull_request_files_model(repo_id,logger, db_session, key_auth, full_collection=False): + + pr_file_batch_size = get_batch_size() + if full_collection: # query existing PRs and the respective url we will append the commits url to pr_number_sql = s.sql.text(""" @@ -18,7 +22,7 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) - result = augur_db.execute_sql(pr_number_sql)#.fetchall() + result = db_session.execute_sql(pr_number_sql)#.fetchall() pr_numbers = [dict(row) for row in result.mappings()] else: @@ -32,7 +36,7 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection 'pull_request_id': pr.pull_request_id }) - query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) + query = db_session.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') owner, name = get_owner_repo(repo.repo_git) @@ -40,7 +44,6 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) - BATCH_SIZE = 1000 pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] pr_file_rows = [] logger.info(f"Getting pull request files for repo: {repo.repo_git}") @@ -95,9 +98,9 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection pr_file_rows.append(data) - if len(pr_file_rows) >= BATCH_SIZE: + if len(pr_file_rows) >= pr_file_batch_size: logger.info(f"{task_name}: Inserting {len(pr_file_rows)} rows") - augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) + db_session.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) pr_file_rows.clear() except NotFoundException as e: logger.info(f"{task_name}: PR with number of {pr_info['pr_src_number']} returned 404 on file data. 
Skipping.") @@ -109,4 +112,4 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection if len(pr_file_rows) > 0: logger.info(f"{task_name}: Inserting {len(pr_file_rows)} rows") - augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) + db_session.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/collectoss/tasks/github/pull_requests/files_model/tasks.py b/collectoss/tasks/github/pull_requests/files_model/tasks.py new file mode 100644 index 000000000..7caf27d1d --- /dev/null +++ b/collectoss/tasks/github/pull_requests/files_model/tasks.py @@ -0,0 +1,18 @@ +import logging +from collectoss.tasks.github.pull_requests.files_model.core import * +from collectoss.tasks.github.util.github_task_session import GithubTaskManifest +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import SecondaryRepoCollectionTask +from collectoss.application.db.util import execute_session_query + +@celery.task(base=SecondaryRepoCollectionTask) +def process_pull_request_files(repo_git: str, full_collection: bool) -> None: + + logger = logging.getLogger(process_pull_request_files.__name__) + + with GithubTaskManifest(logger) as manifest: + db_session = manifest.db_session + query = db_session.session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + pull_request_files_model(repo.repo_id, logger, db_session, manifest.key_auth, full_collection) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/collectoss/tasks/github/pull_requests/tasks.py similarity index 83% rename from augur/tasks/github/pull_requests/tasks.py rename to collectoss/tasks/github/pull_requests/tasks.py index 3d9f0a4a2..f8966ee6e 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/collectoss/tasks/github/pull_requests/tasks.py @@ -1,36 +1,37 @@ import logging from datetime import datetime, timedelta, timezone -from augur.tasks.github.pull_requests.core import extract_data_from_pr_list -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.data_parse import * -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo -from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id, batch_insert_contributors -from augur.application.db.util import execute_session_query +from collectoss.tasks.github.pull_requests.core import extract_data_from_pr_list +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask, SecondaryRepoCollectionTask +from collectoss.application.db.data_parse import * +from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from collectoss.tasks.util.worker_util import 
remove_duplicate_dicts +from collectoss.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo +from collectoss.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo +from collectoss.tasks.github.util.github_task_session import GithubTaskManifest +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id, batch_insert_contributors, get_batch_size +from collectoss.application.db.util import execute_session_query from ..messages import process_github_comment_contributors -from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs, get_core_data_last_collected +from collectoss.application.db.lib import get_secondary_data_last_collected, get_updated_prs, get_core_data_last_collected from typing import List + platform_id = 1 -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_pull_requests(repo_git: str, full_collection: bool) -> int: logger = logging.getLogger(collect_pull_requests.__name__) with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + db_session = manifest.db_session - repo_id = augur_db.session.query(Repo).filter( + repo_id = db_session.session.query(Repo).filter( Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) @@ -41,19 +42,21 @@ def collect_pull_requests(repo_git: str, full_collection: bool) -> int: # subtract 2 days to ensure all data is collected core_data_last_collected = (get_core_data_last_collected(repo_id) - timedelta(days=2)).replace(tzinfo=timezone.utc) + pr_batch_size = get_batch_size() + total_count = 0 all_data = [] for pr in retrieve_all_pr_data(repo_git, logger, manifest.key_auth, core_data_last_collected): - + all_data.append(pr) - if len(all_data) >= 1000: - process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db) + if len(all_data) >= pr_batch_size: + process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, db_session) total_count += len(all_data) all_data.clear() if all_data: - process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db) + process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, db_session) total_count += len(all_data) if total_count > 0: @@ -90,16 +93,16 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ if since and datetime.fromisoformat(pr["updated_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) < since: return -def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): +def process_pull_requests(pull_requests, task_name, repo_id, logger, db_session): """ Parse and insert all retrieved PR data. 
Arguments: pull_requests: List of paginated pr endpoint data task_name: Name of the calling task and the repo - repo_id: augur id of the repository + repo_id: collectoss id for the repository logger: logging object - augur_db: sqlalchemy db object + db_session: sqlalchemy db object """ tool_source = "Pr Task" tool_version = "2.0" @@ -112,7 +115,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + db_session.insert_data(contributors, Contributor, ["cntrb_id"]) # insert the prs into the pull_requests table. @@ -122,7 +125,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): pr_natural_keys = ["repo_id", "pr_src_id"] pr_return_columns = ["pull_request_id", "pr_url"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = augur_db.insert_data(pr_dicts, PullRequest, pr_natural_keys, + pr_return_data = db_session.insert_data(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) if pr_return_data is None: @@ -161,24 +164,24 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - augur_db.insert_data(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + db_session.insert_data(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) # inserting pr assignees # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - augur_db.insert_data(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) + db_session.insert_data(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) # inserting pr requested reviewers # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - augur_db.insert_data(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) + db_session.insert_data(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] pr_metadata_string_fields = ["pr_src_meta_label"] - augur_db.insert_data(pr_metadata_dicts, PullRequestMeta, + db_session.insert_data(pr_metadata_dicts, PullRequestMeta, pr_metadata_natural_keys, string_fields=pr_metadata_string_fields) @@ -194,7 +197,7 @@ def process_pull_request_review_contributor(pr_review: dict, tool_source: str, t return pr_review_cntrb -@celery.task(base=AugurSecondaryRepoCollectionTask) +@celery.task(base=SecondaryRepoCollectionTask) def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -> None: """ Collect pull request review comments for a repository from the GitHub API. @@ -213,7 +216,7 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - None. Data is inserted directly into the database. 
Note: - - Inherits error handling from AugurSecondaryRepoCollectionTask base class. + - Inherits error handling from SecondaryRepoCollectionTask base class. - Contributors are deduplicated within each batch before insertion. - Uses ON CONFLICT upsert logic to handle duplicate messages gracefully. """ @@ -238,9 +241,13 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - pr_reviews = get_pull_request_reviews_by_repo_id(repo_id) - # Build mapping once: github pr_review_src_id -> augur pr_review_id + # Build mapping once: github pr_review_src_id -> collectoss pr_review_id pr_review_id_mapping = {review.pr_review_src_id: review.pr_review_id for review in pr_reviews} + if not pr_review_id_mapping: + logger.debug(f"{owner}/{repo} No PR reviews to collect review comments for") + return + tool_source = "Pr review comment task" tool_version = "2.0" data_source = "Github API" @@ -248,8 +255,9 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - key_auth = GithubRandomKeyAuth(logger) github_data_access = GithubDataAccess(key_auth, logger) + pr_review_comment_batch_size = get_batch_size() + # Batch processing: accumulate comments until batch size reached, then flush - COMMENT_BATCH_SIZE = 1000 contributors = [] pr_review_comment_dicts = [] pr_review_msg_mapping_data = {} @@ -271,7 +279,7 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - pr_review_msg_mapping_data[comment["id"]] = comment # Flush batch when threshold reached (check both to prevent unbounded growth) - if len(pr_review_comment_dicts) >= COMMENT_BATCH_SIZE or len(contributors) >= COMMENT_BATCH_SIZE: + if len(pr_review_comment_dicts) >= pr_review_comment_batch_size or len(contributors) >= pr_review_comment_batch_size: refs_inserted = _flush_pr_review_comment_batch( logger, contributors, pr_review_comment_dicts, pr_review_msg_mapping_data, pr_review_id_mapping, repo_id, tool_version, data_source, owner, repo @@ -316,7 +324,7 @@ def _flush_contributors(logger, contributors: list, owner: str, repo: str, conte batch_insert_contributors(logger, unique_contributors) -def _flush_pr_review_batch(augur_db, contributors: list, pr_reviews: list, logger, owner: str, repo: str) -> None: +def _flush_pr_review_batch(db_session, contributors: list, pr_reviews: list, logger, owner: str, repo: str) -> None: """ Insert accumulated PR review batch data into the database. @@ -324,7 +332,7 @@ def _flush_pr_review_batch(augur_db, contributors: list, pr_reviews: list, logge contributors and PR reviews. Uses ON CONFLICT upsert logic via insert_data(). Args: - augur_db: DatabaseSession instance for database operations. + db_session: DatabaseSession instance for database operations. contributors: List of contributor dicts to insert. Will be deduplicated using remove_duplicate_dicts() before insertion. pr_reviews: List of PR review dicts to insert. 
@@ -341,7 +349,7 @@ def _flush_pr_review_batch(augur_db, contributors: list, pr_reviews: list, logge logger.info(f"{owner}/{repo}: Inserting {len(pr_reviews)} pr reviews") pr_review_natural_keys = ["pr_review_src_id"] pr_review_string_fields = ["pr_review_body"] - augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys, string_fields=pr_review_string_fields) + db_session.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys, string_fields=pr_review_string_fields) def _flush_pr_review_comment_batch( @@ -370,7 +378,7 @@ def _flush_pr_review_comment_batch( pr_review_comment_dicts: List of message dicts to insert into Message table. pr_review_msg_mapping_data: Dict mapping github_msg_id to raw comment data (needed for creating review refs after message insert). - pr_review_id_mapping: Dict mapping github pr_review_src_id to augur pr_review_id. + pr_review_id_mapping: Dict mapping github pr_review_src_id to collectoss pr_review_id. repo_id: The repository ID. tool_version: Tool version string for metadata. data_source: Data source string for metadata. @@ -399,22 +407,22 @@ def _flush_pr_review_comment_batch( pr_review_message_ref_insert_data = [] for data in message_return_data: - augur_msg_id = data["msg_id"] + msg_id = data["msg_id"] github_msg_id = data["platform_msg_id"] comment = pr_review_msg_mapping_data[github_msg_id] - comment["msg_id"] = augur_msg_id + comment["msg_id"] = msg_id github_pr_review_id = comment["pull_request_review_id"] try: - augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] + pr_review_id = pr_review_id_mapping[github_pr_review_id] except KeyError: logger.warning(f"{owner}/{repo}: Could not find related pr review. We were searching for pr review with id: {github_pr_review_id}") continue pr_review_message_ref = extract_pr_review_message_ref_data( - comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source + comment, pr_review_id, github_pr_review_id, repo_id, tool_version, data_source ) pr_review_message_ref_insert_data.append(pr_review_message_ref) @@ -430,7 +438,7 @@ def _flush_pr_review_comment_batch( return len(pr_review_message_ref_insert_data) -@celery.task(base=AugurSecondaryRepoCollectionTask) +@celery.task(base=SecondaryRepoCollectionTask) def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: """ Collect pull request reviews for a repository from the GitHub API. @@ -449,7 +457,7 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: None. Data is inserted directly into the database. Note: - - Inherits error handling from AugurSecondaryRepoCollectionTask base class. + - Inherits error handling from SecondaryRepoCollectionTask base class. - Contributors are deduplicated within each batch before insertion. - Uses ON CONFLICT upsert logic to handle duplicate reviews gracefully. 
""" @@ -463,13 +471,13 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + db_session = manifest.db_session - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + query = db_session.session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id if full_collection: - query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) + query = db_session.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) prs = execute_session_query(query, 'all') else: last_collected = get_secondary_data_last_collected(repo_id).date() @@ -484,8 +492,9 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: github_data_access = GithubDataAccess(manifest.key_auth, logger) + pr_review_batch_size = get_batch_size() + # Batch processing: accumulate reviews until batch size reached, then flush - REVIEW_BATCH_SIZE = 1000 contributors = [] pr_review_dicts = [] total_reviews_collected = 0 @@ -520,15 +529,15 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: ) # Flush batch when threshold reached - if len(pr_review_dicts) >= REVIEW_BATCH_SIZE: - _flush_pr_review_batch(augur_db, contributors, pr_review_dicts, logger, owner, repo) + if len(pr_review_dicts) >= pr_review_batch_size: + _flush_pr_review_batch(db_session, contributors, pr_review_dicts, logger, owner, repo) total_reviews_collected += len(pr_review_dicts) contributors.clear() pr_review_dicts.clear() # Flush any remaining data if pr_review_dicts: - _flush_pr_review_batch(augur_db, contributors, pr_review_dicts, logger, owner, repo) + _flush_pr_review_batch(db_session, contributors, pr_review_dicts, logger, owner, repo) total_reviews_collected += len(pr_review_dicts) if total_reviews_collected == 0: diff --git a/augur/tasks/github/releases/__init__.py b/collectoss/tasks/github/releases/__init__.py similarity index 100% rename from augur/tasks/github/releases/__init__.py rename to collectoss/tasks/github/releases/__init__.py diff --git a/augur/tasks/github/releases/core.py b/collectoss/tasks/github/releases/core.py similarity index 93% rename from augur/tasks/github/releases/core.py rename to collectoss/tasks/github/releases/core.py index f6b2f5e56..643e1eb63 100644 --- a/augur/tasks/github/releases/core.py +++ b/collectoss/tasks/github/releases/core.py @@ -1,10 +1,11 @@ #SPDX-License-Identifier: MIT -from augur.tasks.github.util.github_task_session import * -from augur.application.db.models import * -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict -from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts +from collectoss.tasks.github.util.github_task_session import * +from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from collectoss.application.db.models import * +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.tasks.github.util.gh_graphql_entities import request_graphql_dict +from collectoss.application.db.util import execute_session_query +from collectoss.application.db.lib import bulk_insert_dicts def get_release_inf(repo_id, release, tag_only): @@ -159,7 +160,7 @@ def fetch_data(key_auth, logger, github_url, 
repo_id, tag_only = False): owner, repo = get_owner_repo(github_url) - url = 'https://api.github.com/graphql' + url = GithubGraphQlDataAccess.base_url() query = get_query(logger, owner, repo, tag_only) diff --git a/collectoss/tasks/github/releases/tasks.py b/collectoss/tasks/github/releases/tasks.py new file mode 100644 index 000000000..6ffbb6412 --- /dev/null +++ b/collectoss/tasks/github/releases/tasks.py @@ -0,0 +1,22 @@ +import logging + +from collectoss.tasks.github.releases.core import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.application.db.lib import get_repo_by_repo_git, get_session +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + + +@celery.task(base=CoreRepoCollectionTask) +def collect_releases(repo_git): + + logger = logging.getLogger(collect_releases.__name__) + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id + + key_auth = GithubRandomKeyAuth(logger) + + with get_session() as session: + + releases_model(session, key_auth, logger, repo_git, repo_id) \ No newline at end of file diff --git a/augur/tasks/github/repo_info/__init__.py b/collectoss/tasks/github/repo_info/__init__.py similarity index 100% rename from augur/tasks/github/repo_info/__init__.py rename to collectoss/tasks/github/repo_info/__init__.py diff --git a/augur/tasks/github/repo_info/core.py b/collectoss/tasks/github/repo_info/core.py similarity index 94% rename from augur/tasks/github/repo_info/core.py rename to collectoss/tasks/github/repo_info/core.py index 57cd970bc..55b1def2a 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/collectoss/tasks/github/repo_info/core.py @@ -1,15 +1,15 @@ #SPDX-License-Identifier: MIT import json import sqlalchemy as s -from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException -from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess -from augur.tasks.github.util.github_paginator import hit_api -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict -from augur.application.db.models import * -from augur.application.db.lib import execute_sql -from augur.tasks.github.util.github_task_session import * -from augur.application.db.models.augur_data import RepoBadging +from collectoss.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException +from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from collectoss.tasks.github.util.github_paginator import hit_api +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.tasks.github.util.gh_graphql_entities import request_graphql_dict +from collectoss.application.db.models import * +from collectoss.application.db.lib import execute_sql +from collectoss.tasks.github.util.github_task_session import * +from collectoss.application.db.models.augur_data import RepoBadging from urllib.parse import quote def query_committers_count(key_auth, logger, owner, repo): @@ -93,7 +93,7 @@ def is_archived(logger, repo_data): return False def grab_repo_info_from_graphql_endpoint(key_auth, logger, query): - url = 'https://api.github.com/graphql' + url = GithubGraphQlDataAccess.base_url() # Hit the graphql endpoint and retry 3 times in case of failure logger.info("Hitting endpoint: {} ...\n".format(url)) data = request_graphql_dict(key_auth, logger, url, 
query) diff --git a/augur/tasks/github/repo_info/tasks.py b/collectoss/tasks/github/repo_info/tasks.py similarity index 55% rename from augur/tasks/github/repo_info/tasks.py rename to collectoss/tasks/github/repo_info/tasks.py index 85d639d2a..66144dfc5 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/collectoss/tasks/github/repo_info/tasks.py @@ -1,16 +1,16 @@ import logging -from augur.application.db.session import DatabaseSession -from augur.tasks.github.repo_info.core import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.application.db import get_engine +from collectoss.application.db.session import DatabaseSession +from collectoss.tasks.github.repo_info.core import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.application.db.lib import get_repo_by_repo_git +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.application.db import get_engine #Task to get regular misc github info -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_repo_info(repo_git: str): logger = logging.getLogger(collect_repo_info.__name__) @@ -23,7 +23,7 @@ def collect_repo_info(repo_git: str): #Task to get CII api data for linux badge info using github data. -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_linux_badge_info(repo_git: str): engine = get_engine() diff --git a/augur/tasks/github/traffic.py b/collectoss/tasks/github/traffic.py similarity index 79% rename from augur/tasks/github/traffic.py rename to collectoss/tasks/github/traffic.py index 8f1903e4e..163b97ad5 100644 --- a/augur/tasks/github/traffic.py +++ b/collectoss/tasks/github/traffic.py @@ -1,12 +1,12 @@ import logging -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.data_parse import extract_needed_clone_history_data -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import RepoClone -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.data_parse import extract_needed_clone_history_data +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.tasks.github.util.util import get_owner_repo +from collectoss.application.db.models import RepoClone +from collectoss.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @celery.task diff --git a/augur/tasks/github/util/__init__.py b/collectoss/tasks/github/util/__init__.py similarity index 100% rename from augur/tasks/github/util/__init__.py rename to collectoss/tasks/github/util/__init__.py diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/collectoss/tasks/github/util/gh_graphql_entities.py similarity index 99% rename from augur/tasks/github/util/gh_graphql_entities.py rename to 
collectoss/tasks/github/util/gh_graphql_entities.py index 0667ab331..bb5f95e98 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/collectoss/tasks/github/util/gh_graphql_entities.py @@ -1,4 +1,4 @@ -from augur.tasks.github.util.github_task_session import * +from collectoss.tasks.github.util.github_task_session import * #from gql import gql, Client #from gql.transport.aiohttp import AIOHTTPTransport import httpx @@ -6,7 +6,7 @@ import collections import time import traceback -from augur.tasks.github.util.github_paginator import GithubApiResult, process_dict_response +from collectoss.tasks.github.util.github_paginator import GithubApiResult, process_dict_response """ Should be designed on a per entity basis that has attributes that call diff --git a/augur/tasks/github/util/github_api_key_handler.py b/collectoss/tasks/github/util/github_api_key_handler.py similarity index 97% rename from augur/tasks/github/util/github_api_key_handler.py rename to collectoss/tasks/github/util/github_api_key_handler.py index 47933e67d..5cabe1fab 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/collectoss/tasks/github/util/github_api_key_handler.py @@ -5,8 +5,8 @@ from typing import List from sqlalchemy.orm import Session -from augur.tasks.util.redis_list import RedisList -from augur.application.db.lib import get_value, get_worker_oauth_keys +from collectoss.tasks.util.redis_list import RedisList +from collectoss.application.db.lib import get_value, get_worker_oauth_keys from sqlalchemy import func RATE_LIMIT_URL = "https://api.github.com/rate_limit" diff --git a/augur/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py similarity index 97% rename from augur/tasks/github/util/github_data_access.py rename to collectoss/tasks/github/util/github_data_access.py index 782728233..18256fe68 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -4,6 +4,7 @@ from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception, RetryError from urllib.parse import urlparse, parse_qs, urlencode from keyman.KeyClient import KeyClient +from collectoss.util.keys import mask_key GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -13,7 +14,7 @@ class RatelimitException(Exception): def __init__(self, response, keys_used, message="Github Rate limit exceeded") -> None: self.response = response - + keys_used = [mask_key(k) for k in keys_used] super().__init__(f"{message}. Keys used: {keys_used}") class UrlNotFoundException(Exception): @@ -122,8 +123,8 @@ def make_request(self, url, method="GET", timeout=100): if response.status_code in [403, 429]: self.expired_keys_for_request.append(self.key) - self.logger.warning(f"Github rate limit exceeded. Key: {self.key[-5:]}. Response: {response.text}") - raise RatelimitException(response, self.expired_keys_for_request[-5:]) + self.logger.warning(f"Github rate limit exceeded. Key: {mask_key(self.key)}. Response: {response.text}") + raise RatelimitException(response, self.expired_keys_for_request) # There are cases with PR files, PR commits, and messages where the parent object is removed after # It is collected, leading the associated URL for those objects to return a 404.
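The new mask_key import replaces the raw key slices in the warning and exception messages above; note that the old expired_keys_for_request[-5:] truncated the list of keys rather than masking them, while self.key[-5:] exposed each key's last five characters, so the change fixes both problems at once. The diff does not show collectoss.util.keys itself, so the helper below is only a plausible sketch, not the project's implementation:

def mask_key(key: str, visible: int = 4) -> str:
    # Hypothetical masking helper: hide all but the last few characters
    # of an API key so it can appear safely in logs and exceptions.
    if not key:
        return ""
    return "*" * max(len(key) - visible, 0) + key[-visible:]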
@@ -147,7 +148,7 @@ def make_request(self, url, method="GET", timeout=100): try: if self.feature == "rest" and "X-RateLimit-Remaining" in response.headers and int(response.headers["X-RateLimit-Remaining"]) < GITHUB_RATELIMIT_REMAINING_CAP: self.expired_keys_for_request.append(self.key) - raise RatelimitException(response, self.expired_keys_for_request[-5:]) + raise RatelimitException(response, self.expired_keys_for_request) except ValueError: self.logger.warning(f"X-RateLimit-Remaining was not an integer. Value: {response.headers['X-RateLimit-Remaining']}") @@ -225,7 +226,7 @@ def __handle_github_ratelimit_response(self, response): self.key = self.key_client.expire(self.key, time.time() + 60) if previous_key == self.key: - self.logger.error(f"The same key was returned after a request to expire it was sent (key: {self.key[-5:]})") + self.logger.error(f"The same key was returned after a request to expire it was sent (key: {mask_key(self.key)})") def __add_query_params(self, url: str, additional_params: dict) -> str: """Add query params to a url. diff --git a/augur/tasks/github/util/github_graphql_data_access.py b/collectoss/tasks/github/util/github_graphql_data_access.py similarity index 98% rename from augur/tasks/github/util/github_graphql_data_access.py rename to collectoss/tasks/github/util/github_graphql_data_access.py index 96b0c6ab7..dc047b188 100644 --- a/augur/tasks/github/util/github_graphql_data_access.py +++ b/collectoss/tasks/github/util/github_graphql_data_access.py @@ -20,7 +20,12 @@ class InvalidDataException(Exception): pass class GithubGraphQlDataAccess: + """Utilities for accessing the GitHub GraphQL API + """ + @staticmethod + def base_url(): + return URL def __init__(self, key_manager, logger: logging.Logger, ingore_not_found_error=False): diff --git a/augur/tasks/github/util/github_paginator.py b/collectoss/tasks/github/util/github_paginator.py similarity index 100% rename from augur/tasks/github/util/github_paginator.py rename to collectoss/tasks/github/util/github_paginator.py diff --git a/augur/tasks/github/util/github_random_key_auth.py b/collectoss/tasks/github/util/github_random_key_auth.py similarity index 92% rename from augur/tasks/github/util/github_random_key_auth.py rename to collectoss/tasks/github/util/github_random_key_auth.py index 1c7fc74e8..6797ba785 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/collectoss/tasks/github/util/github_random_key_auth.py @@ -1,7 +1,7 @@ """Defines the GithubRandomKeyAuth class""" -from augur.tasks.util.random_key_auth import RandomKeyAuth -from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from collectoss.tasks.util.random_key_auth import RandomKeyAuth +from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from sqlalchemy.orm import Session class GithubRandomKeyAuth(RandomKeyAuth): diff --git a/augur/tasks/github/util/github_task_session.py b/collectoss/tasks/github/util/github_task_session.py similarity index 78% rename from augur/tasks/github/util/github_task_session.py rename to collectoss/tasks/github/util/github_task_session.py index 2869643bd..c9c80ab57 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/collectoss/tasks/github/util/github_task_session.py @@ -1,8 +1,8 @@ from logging import Logger -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.application.db.session import DatabaseSession -from augur.application.db import get_engine +from 
collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db import get_engine class GithubTaskManifest: @@ -10,8 +10,8 @@ def __init__(self, logger): engine = get_engine() - self.augur_db = DatabaseSession(logger, engine) - #self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + self.db_session = DatabaseSession(logger, engine) + #self.key_auth = GithubRandomKeyAuth(self.db_session.session, logger) #totalHack self.key_auth = GithubRandomKeyAuth(logger) self.logger = logger @@ -23,7 +23,7 @@ def __enter__(self): def __exit__(self, exception_type, exception_value, exception_traceback): - self.augur_db.close() + self.db_session.close() class GithubTaskSession(DatabaseSession): @@ -53,10 +53,10 @@ def __init__(self, logger: Logger, engine=None): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/ubuntu/github/augur/augur/tasks/frontend.py", line 24, in add_org_repo_list + File "/home/ubuntu/github/collectoss/collectoss/tasks/frontend.py", line 24, in add_org_repo_list with GithubTaskSession(logger) as session: ^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/ubuntu/github/augur/augur/tasks/github/util/github_task_session.py", line 44, in __init__ + File "/home/ubuntu/github/collectoss/collectoss/tasks/github/util/github_task_session.py", line 44, in __init__ self.oauths = GithubRandomKeyAuth(self, logger) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: GithubRandomKeyAuth.__init__() takes 2 positional arguments but 3 were given diff --git a/augur/tasks/github/util/populate_repo_src_id.py b/collectoss/tasks/github/util/populate_repo_src_id.py similarity index 74% rename from augur/tasks/github/util/populate_repo_src_id.py rename to collectoss/tasks/github/util/populate_repo_src_id.py index 4346bcc3d..ae712a3e0 100644 --- a/augur/tasks/github/util/populate_repo_src_id.py +++ b/collectoss/tasks/github/util/populate_repo_src_id.py @@ -1,9 +1,9 @@ import logging import sqlalchemy as s -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_repo_by_repo_git, execute_sql -from augur.tasks.github.util.util import get_owner_repo, get_repo_src_id +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.lib import get_repo_by_repo_git, execute_sql +from collectoss.tasks.github.util.util import get_owner_repo, get_repo_src_id @celery.task def populate_repo_src_id_task(repo_git): diff --git a/augur/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py similarity index 89% rename from augur/tasks/github/util/util.py rename to collectoss/tasks/github/util/util.py index 76e541992..a0f009855 100644 --- a/augur/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -4,10 +4,10 @@ import urllib.parse import json import httpx -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess -from augur.application.db.lib import get_repo_by_repo_git -from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from collectoss.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess +from collectoss.application.db.lib 
import get_repo_by_repo_git +from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps def get_repo_src_id(owner, repo, logger): @@ -99,7 +99,7 @@ def get_repo_weight_by_issue(logger,repo_git): Sum of issues and prs for that repo """ - from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql + from collectoss.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql owner,name = get_owner_repo(repo_git) diff --git a/augur/tasks/gitlab/__init__.py b/collectoss/tasks/gitlab/__init__.py similarity index 100% rename from augur/tasks/gitlab/__init__.py rename to collectoss/tasks/gitlab/__init__.py diff --git a/augur/tasks/gitlab/events_task.py b/collectoss/tasks/gitlab/events_task.py similarity index 86% rename from augur/tasks/gitlab/events_task.py rename to collectoss/tasks/gitlab/events_task.py index 5c85bc5d8..0acb69e05 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/collectoss/tasks/gitlab/events_task.py @@ -1,21 +1,21 @@ """ -Module to define the task methods to collect gitlab event data for augur +Module to define the task methods to collect gitlab event data for collectoss """ import logging -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data -from augur.tasks.github.util.util import get_gitlab_repo_identifier -from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session -from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from collectoss.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data +from collectoss.tasks.github.util.util import get_gitlab_repo_identifier +from collectoss.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session +from collectoss.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth platform_id = 2 -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_gitlab_issue_events(repo_git) -> int: """ Retrieve and parse gitlab events for the desired repo @@ -43,7 +43,7 @@ def collect_gitlab_issue_events(repo_git) -> int: logger.info(f"{owner}/{repo} has no gitlab issue events") -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_gitlab_merge_request_events(repo_git) -> int: """ Retrieve and parse gitlab mrs for the desired repo @@ -117,7 +117,7 @@ def process_issue_events(events, task_name, repo_id, logger, session): Arguments: events: List of dictionaries of issue event data task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object session: sqlalchemy db object """ @@ -162,9 +162,9 @@ def process_mr_events(events, task_name, repo_id, logger, session): Arguments: labels: List of dictionaries of label data - repo_id: augur id of 
the repository - tool_source: The part of augur that processed the data - tool_version: The version of the augur task that processed the data + repo_id: collectoss id for the repository + tool_source: The part of collectoss that processed the data + tool_version: The version of the collectoss task that processed the data data_source: The source of the data diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/collectoss/tasks/gitlab/gitlab_api_handler.py similarity index 98% rename from augur/tasks/gitlab/gitlab_api_handler.py rename to collectoss/tasks/gitlab/gitlab_api_handler.py index 711688b2b..7dbe1a3a7 100644 --- a/augur/tasks/gitlab/gitlab_api_handler.py +++ b/collectoss/tasks/gitlab/gitlab_api_handler.py @@ -10,8 +10,8 @@ from urllib.parse import urlencode, urlparse, parse_qs, urlunparse from enum import Enum -from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth -from augur.tasks.github.util.util import parse_json_response +from collectoss.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from collectoss.tasks.github.util.util import parse_json_response class GitlabApiResult(Enum): """All the different results of querying the Gitlab API.""" diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/collectoss/tasks/gitlab/gitlab_api_key_handler.py similarity index 97% rename from augur/tasks/gitlab/gitlab_api_key_handler.py rename to collectoss/tasks/gitlab/gitlab_api_key_handler.py index 72b0ace14..1114fe504 100644 --- a/augur/tasks/gitlab/gitlab_api_key_handler.py +++ b/collectoss/tasks/gitlab/gitlab_api_key_handler.py @@ -9,8 +9,8 @@ from typing import List -from augur.tasks.util.redis_list import RedisList -from augur.application.db.lib import get_value, get_worker_oauth_keys +from collectoss.tasks.util.redis_list import RedisList +from collectoss.application.db.lib import get_value, get_worker_oauth_keys class NoValidKeysError(Exception): diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/collectoss/tasks/gitlab/gitlab_random_key_auth.py similarity index 84% rename from augur/tasks/gitlab/gitlab_random_key_auth.py rename to collectoss/tasks/gitlab/gitlab_random_key_auth.py index 3269d1ec3..292d66959 100644 --- a/augur/tasks/gitlab/gitlab_random_key_auth.py +++ b/collectoss/tasks/gitlab/gitlab_random_key_auth.py @@ -1,6 +1,6 @@ """Defines the GitlabRandomKeyAuth class""" -from augur.tasks.util.random_key_auth import RandomKeyAuth -from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler +from collectoss.tasks.util.random_key_auth import RandomKeyAuth +from collectoss.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler class GitlabRandomKeyAuth(RandomKeyAuth): diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/collectoss/tasks/gitlab/gitlab_task_session.py similarity index 81% rename from augur/tasks/gitlab/gitlab_task_session.py rename to collectoss/tasks/gitlab/gitlab_task_session.py index 3f65f89f4..42d8b8fea 100644 --- a/augur/tasks/gitlab/gitlab_task_session.py +++ b/collectoss/tasks/gitlab/gitlab_task_session.py @@ -3,9 +3,9 @@ """ from logging import Logger -from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth -from augur.application.db.session import DatabaseSession -from augur.application.db import get_engine +from collectoss.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db import get_engine class GitlabTaskSession(DatabaseSession): """ORM session used in gitlab 
tasks. diff --git a/augur/tasks/gitlab/issues_task.py b/collectoss/tasks/gitlab/issues_task.py similarity index 90% rename from augur/tasks/gitlab/issues_task.py rename to collectoss/tasks/gitlab/issues_task.py index 8a1415a7d..5d08f08a3 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/collectoss/tasks/gitlab/issues_task.py @@ -4,19 +4,19 @@ import logging import traceback -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session, batch_insert_contributors -from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from collectoss.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data +from collectoss.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts +from collectoss.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session, batch_insert_contributors +from collectoss.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth platform_id = 2 -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_gitlab_issues(repo_git : str) -> int: """ Retrieve and parse gitlab issues for the desired repo @@ -92,7 +92,7 @@ def process_issues(issues, task_name, repo_id, logger) -> None: Arguments: issues: List of dictionaries of issue data task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object session: sqlalchemy db object """ @@ -199,7 +199,7 @@ def process_issue_contributors(issue, tool_source, tool_version, data_source): return issue, contributors -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: """ Retrieve and parse gitlab events for the desired repo @@ -277,7 +277,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, session): Arguments: data: List of dictionaries of issue event data task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object 
session: sqlalchemy db object """ @@ -337,12 +337,12 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, session): issue_message_ref_dicts = [] for data in message_return_data: - augur_msg_id = data["msg_id"] + msg_id = data["msg_id"] platform_message_id = data["platform_msg_id"] ref = message_ref_mapping_data[platform_message_id] message_ref_data = ref["msg_ref_data"] - message_ref_data["msg_id"] = augur_msg_id + message_ref_data["msg_id"] = msg_id issue_message_ref_dicts.append(message_ref_data) diff --git a/augur/tasks/gitlab/merge_request_task.py b/collectoss/tasks/gitlab/merge_request_task.py similarity index 92% rename from augur/tasks/gitlab/merge_request_task.py rename to collectoss/tasks/gitlab/merge_request_task.py index 7a3b00618..151de33ae 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/collectoss/tasks/gitlab/merge_request_task.py @@ -1,18 +1,18 @@ import logging -from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee -from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session, batch_insert_contributors +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import CoreRepoCollectionTask +from collectoss.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from collectoss.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data +from collectoss.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts +from collectoss.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee +from collectoss.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from collectoss.tasks.util.worker_util import remove_duplicate_dicts +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session, batch_insert_contributors platform_id = 2 -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_gitlab_merge_requests(repo_git: str) -> int: """ Retrieve and parse gitlab MRs for the desired repo @@ -87,7 +87,7 @@ def process_merge_requests(data, 
task_name, repo_id, logger): Arguments: data: collection of mr data task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object Returns: @@ -163,7 +163,7 @@ def process_merge_requests(data, task_name, repo_id, logger): -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_merge_request_comments(mr_ids, repo_git) -> int: """ Retrieve and parse gitlab events for the desired repo @@ -202,7 +202,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): Arguments: data: List of dictionaries of mr message data task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object session: sqlalchemy db object """ @@ -262,12 +262,12 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): mr_message_ref_dicts = [] for data in message_return_data: - augur_msg_id = data["msg_id"] + msg_id = data["msg_id"] platform_message_id = data["platform_msg_id"] ref = message_ref_mapping_data[platform_message_id] message_ref_data = ref["msg_ref_data"] - message_ref_data["msg_id"] = augur_msg_id + message_ref_data["msg_id"] = msg_id mr_message_ref_dicts.append(message_ref_data) @@ -276,7 +276,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): bulk_insert_dicts(logger, mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_merge_request_metadata(mr_ids, repo_git) -> int: """ Retrieve and parse gitlab events for the desired repo @@ -314,7 +314,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, session): Arguments: data: List of dictionaries of mr metadata task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object session: sqlalchemy db object """ @@ -343,7 +343,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, session): bulk_insert_dicts(logger, all_metadata, PullRequestMeta, pr_metadata_natural_keys) -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_merge_request_reviewers(mr_ids, repo_git) -> int: """ Retrieve and parse mr reviewers for the desired repo @@ -380,7 +380,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, session): Arguments: data: List of dictionaries of mr Reviewer data - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object session: sqlalchemy db object """ @@ -412,7 +412,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, session): -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def collect_merge_request_commits(mr_ids, repo_git) -> int: """ Retrieve and parse mr commits for the desired repo @@ -451,7 +451,7 @@ def process_mr_commits(data, task_name, repo_id, logger, session): Arguments: data: List of dictionaries of mr commit data task_name: name of the task as well as the repo being processed - repo_id: augur id of the repo + repo_id: collectoss id for the repo logger: logging object session: sqlalchemy db object """ @@ -482,7 +482,7 @@ def process_mr_commits(data, task_name, repo_id, logger, session): -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=CoreRepoCollectionTask) def 
collect_merge_request_files(mr_ids, repo_git) -> int: """ Retrieve and parse gitlab events for the desired repo diff --git a/augur/tasks/init/__init__.py b/collectoss/tasks/init/__init__.py similarity index 72% rename from augur/tasks/init/__init__.py rename to collectoss/tasks/init/__init__.py index 194a5ed83..8f193ae3a 100644 --- a/augur/tasks/init/__init__.py +++ b/collectoss/tasks/init/__init__.py @@ -1,8 +1,8 @@ import logging -from augur.application.db.session import DatabaseSession -from augur.application.db.engine import DatabaseEngine -from augur.application.config import AugurConfig +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db.engine import DatabaseEngine +from collectoss.application.config import SystemConfig def get_redis_conn_values(): @@ -10,7 +10,7 @@ def get_redis_conn_values(): with DatabaseEngine() as engine, DatabaseSession(logger, engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) redis_db_number = config.get_value("Redis", "cache_group") * 3 redis_conn_string = config.get_value("Redis", "connection_string") @@ -24,7 +24,7 @@ def get_rabbitmq_conn_string(): logger = logging.getLogger(__name__) with DatabaseEngine() as engine, DatabaseSession(logger, engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) rabbbitmq_conn_string = config.get_value("RabbitMQ", "connection_string") diff --git a/augur/tasks/init/celery_app.py b/collectoss/tasks/init/celery_app.py similarity index 53% rename from augur/tasks/init/celery_app.py rename to collectoss/tasks/init/celery_app.py index d1209fadd..e14230f99 100644 --- a/augur/tasks/init/celery_app.py +++ b/collectoss/tasks/init/celery_app.py @@ -12,54 +12,54 @@ from celery.signals import after_setup_logger -from augur.application.logs import TaskLogConfig, AugurLogger -from augur.application.db.session import DatabaseSession -from augur.application.db import get_engine -from augur.application.db.lib import get_session -from augur.application.config import AugurConfig -from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string -from augur.application.db.models import Repo -from augur.tasks.util.collection_state import CollectionState +from collectoss.application.logs import TaskLogConfig, SystemLogger +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db import get_engine +from collectoss.application.db.lib import get_session +from collectoss.application.config import SystemConfig +from collectoss.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string +from collectoss.application.db.models import Repo +from collectoss.tasks.util.collection_state import CollectionState logger = logging.getLogger(__name__) -start_tasks = ['augur.tasks.start_tasks', - 'augur.tasks.data_analysis', - 'augur.tasks.util.collection_util'] - -github_tasks = ['augur.tasks.github.contributors', - 'augur.tasks.github.issues', - 'augur.tasks.github.pull_requests.tasks', - 'augur.tasks.github.events', - 'augur.tasks.github.messages', - 'augur.tasks.github.facade_github.tasks', - 'augur.tasks.github.releases.tasks', - 'augur.tasks.github.repo_info.tasks', - 'augur.tasks.github.detect_move.tasks', - 'augur.tasks.github.pull_requests.files_model.tasks', - 'augur.tasks.github.pull_requests.commits_model.tasks', - 'augur.tasks.github.traffic', - 'augur.tasks.github.util.populate_repo_src_id'] - -gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', - 
'augur.tasks.gitlab.issues_task', - 'augur.tasks.gitlab.events_task'] - -git_tasks = ['augur.tasks.git.facade_tasks', - 'augur.tasks.git.dependency_tasks.tasks', - 'augur.tasks.git.dependency_libyear_tasks.tasks', - 'augur.tasks.git.scc_value_tasks.tasks'] - -data_analysis_tasks = ['augur.tasks.data_analysis.message_insights.tasks', - 'augur.tasks.data_analysis.clustering_worker.tasks', - 'augur.tasks.data_analysis.discourse_analysis.tasks', - 'augur.tasks.data_analysis.pull_request_analysis_worker.tasks', - 'augur.tasks.data_analysis.insight_worker.tasks', - 'augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker'] - -materialized_view_tasks = ['augur.tasks.db.refresh_materialized_views'] - -frontend_tasks = ['augur.tasks.frontend'] +start_tasks = ['collectoss.tasks.start_tasks', + 'collectoss.tasks.data_analysis', + 'collectoss.tasks.util.collection_util'] + +github_tasks = ['collectoss.tasks.github.contributors', + 'collectoss.tasks.github.issues', + 'collectoss.tasks.github.pull_requests.tasks', + 'collectoss.tasks.github.events', + 'collectoss.tasks.github.messages', + 'collectoss.tasks.github.facade_github.tasks', + 'collectoss.tasks.github.releases.tasks', + 'collectoss.tasks.github.repo_info.tasks', + 'collectoss.tasks.github.detect_move.tasks', + 'collectoss.tasks.github.pull_requests.files_model.tasks', + 'collectoss.tasks.github.pull_requests.commits_model.tasks', + 'collectoss.tasks.github.traffic', + 'collectoss.tasks.github.util.populate_repo_src_id'] + +gitlab_tasks = ['collectoss.tasks.gitlab.merge_request_task', + 'collectoss.tasks.gitlab.issues_task', + 'collectoss.tasks.gitlab.events_task'] + +git_tasks = ['collectoss.tasks.git.facade_tasks', + 'collectoss.tasks.git.dependency_tasks.tasks', + 'collectoss.tasks.git.dependency_libyear_tasks.tasks', + 'collectoss.tasks.git.scc_value_tasks.tasks'] + +data_analysis_tasks = ['collectoss.tasks.data_analysis.message_insights.tasks', + 'collectoss.tasks.data_analysis.clustering_worker.tasks', + 'collectoss.tasks.data_analysis.discourse_analysis.tasks', + 'collectoss.tasks.data_analysis.pull_request_analysis_worker.tasks', + 'collectoss.tasks.data_analysis.insight_worker.tasks', + 'collectoss.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker'] + +materialized_view_tasks = ['collectoss.tasks.db.refresh_materialized_views'] + +frontend_tasks = ['collectoss.tasks.frontend'] tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks @@ -74,14 +74,14 @@ #Classes for tasks that take a repo_git as an argument. 
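# Illustrative sketch, not part of this changeset: the renamed classes below follow
# Celery's custom-base-class pattern, subclassing celery.Task and overriding
# on_failure so every collection task records errors uniformly. The names here
# (FailureLoggingTask, collect) are hypothetical.
import logging
import celery

class FailureLoggingTask(celery.Task):
    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Celery calls this hook automatically whenever the task body raises.
        logging.getLogger("task_failure").error(
            "Task %s failed with %r (args=%s)", task_id, exc, args)

# Tasks opt in via the base= keyword, as this diff does with CoreRepoCollectionTask:
#   @celery_app.task(base=FailureLoggingTask)
#   def collect(repo_git): ...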
-class AugurCoreRepoCollectionTask(celery.Task): +class CoreRepoCollectionTask(celery.Task): - def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value): + def handle_celery_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value): # Note: I think self.app.engine would work but leaving it to try later engine = get_engine() - logger = AugurLogger(logger_name).get_logger() + logger = SystemLogger(logger_name).get_logger() logger.error(f"Task {task_id} raised exception: {exc}\n Traceback: {''.join(traceback.format_exception(None, exc, exc.__traceback__))}") @@ -104,7 +104,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h def on_failure(self,exc,task_id,args, kwargs, einfo): repo_git = self._extract_repo_git(args, kwargs) # log traceback to error file - self.augur_handle_task_failure(exc, task_id, repo_git, "core_task_failure") + self.handle_celery_task_failure(exc, task_id, repo_git, "core_task_failure") def _extract_repo_git(self, args, kwargs): if 'repo_git' in kwargs: @@ -121,43 +121,43 @@ def _extract_repo_git(self, args, kwargs): return None -class AugurSecondaryRepoCollectionTask(AugurCoreRepoCollectionTask): +class SecondaryRepoCollectionTask(CoreRepoCollectionTask): def on_failure(self,exc,task_id,args, kwargs, einfo): repo_git = self._extract_repo_git(args, kwargs) - self.augur_handle_task_failure(exc, task_id, repo_git, "secondary_task_failure",collection_hook='secondary') + self.handle_celery_task_failure(exc, task_id, repo_git, "secondary_task_failure",collection_hook='secondary') -class AugurFacadeRepoCollectionTask(AugurCoreRepoCollectionTask): +class FacadeRepoCollectionTask(CoreRepoCollectionTask): def on_failure(self,exc,task_id,args, kwargs, einfo): repo_git = self._extract_repo_git(args, kwargs) - self.augur_handle_task_failure(exc, task_id, repo_git, "facade_task_failure",collection_hook='facade') + self.handle_celery_task_failure(exc, task_id, repo_git, "facade_task_failure",collection_hook='facade') -class AugurMlRepoCollectionTask(AugurCoreRepoCollectionTask): +class MLRepoCollectionTask(CoreRepoCollectionTask): def on_failure(self,exc,task_id,args,kwargs,einfo): repo_git = self._extract_repo_git(args, kwargs) - self.augur_handle_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml') + self.handle_celery_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml') -#task_cls='augur.tasks.init.celery_app:AugurCoreRepoCollectionTask' +#task_cls='collectoss.tasks.init.celery_app:CoreRepoCollectionTask' celery_app = Celery('tasks', broker=BROKER_URL, backend=BACKEND_URL, include=tasks) # define the queues that tasks will be put in (by default tasks are put in celery queue) celery_app.conf.task_routes = { - 'augur.tasks.start_tasks.*': {'queue': 'scheduling'}, - 'augur.tasks.util.collection_util.*': {'queue': 'scheduling'}, - 'augur.tasks.git.facade_tasks.*': {'queue': 'facade'}, - 'augur.tasks.github.facade_github.tasks.*': {'queue': 'facade'}, - 'augur.tasks.github.pull_requests.commits_model.tasks.*': {'queue': 'secondary'}, - 'augur.tasks.github.pull_requests.files_model.tasks.*': {'queue': 'secondary'}, - 'augur.tasks.github.pull_requests.tasks.collect_pull_request_reviews': {'queue': 'secondary'}, - 'augur.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'}, - 
'augur.tasks.git.dependency_tasks.tasks.process_ossf_dependency_metrics': {'queue': 'secondary'}, - 'augur.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'}, - 'augur.tasks.git.scc_value_tasks.tasks.process_scc_value_metrics' : {'queue': 'facade'}, - 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'}, - 'augur.tasks.frontend.*': {'queue': 'frontend'}, - 'augur.tasks.data_analysis.contributor_breadth_worker.*': {'queue': 'secondary'}, + 'collectoss.tasks.start_tasks.*': {'queue': 'scheduling'}, + 'collectoss.tasks.util.collection_util.*': {'queue': 'scheduling'}, + 'collectoss.tasks.git.facade_tasks.*': {'queue': 'facade'}, + 'collectoss.tasks.github.facade_github.tasks.*': {'queue': 'facade'}, + 'collectoss.tasks.github.pull_requests.commits_model.tasks.*': {'queue': 'secondary'}, + 'collectoss.tasks.github.pull_requests.files_model.tasks.*': {'queue': 'secondary'}, + 'collectoss.tasks.github.pull_requests.tasks.collect_pull_request_reviews': {'queue': 'secondary'}, + 'collectoss.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'}, + 'collectoss.tasks.git.dependency_tasks.tasks.process_ossf_dependency_metrics': {'queue': 'secondary'}, + 'collectoss.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'}, + 'collectoss.tasks.git.scc_value_tasks.tasks.process_scc_value_metrics' : {'queue': 'facade'}, + 'collectoss.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'}, + 'collectoss.tasks.frontend.*': {'queue': 'frontend'}, + 'collectoss.tasks.data_analysis.contributor_breadth_worker.*': {'queue': 'secondary'}, } #Setting to be able to see more detailed states of running tasks @@ -180,18 +180,18 @@ def on_failure(self,exc,task_id,args,kwargs,einfo): -def split_tasks_into_groups(augur_tasks: List[str]) -> Dict[str, List[str]]: +def split_tasks_into_groups(task_list: List[str]) -> Dict[str, List[str]]: """Split tasks on the celery app into groups. 
Args: - augur_tasks: list of tasks specified in augur + task_list: list of tasks specified in collectoss Returns The tasks so that they are grouped by the module they are defined in """ grouped_tasks = {} - for task in augur_tasks: + for task in task_list: task_divided = task.split(".") try: @@ -218,22 +218,22 @@ def setup_periodic_tasks(sender, **kwargs): The tasks so that they are grouped by the module they are defined in """ from celery.schedules import crontab - from augur.tasks.start_tasks import augur_collection_monitor - from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos, create_collection_status_records - from augur.tasks.git.facade_tasks import clone_repos - from augur.tasks.github.contributors import process_contributors - from augur.tasks.db.refresh_materialized_views import refresh_materialized_views - from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model - from augur.application.db import temporary_database_engine + from collectoss.tasks.start_tasks import collection_monitor + from collectoss.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos, create_collection_status_records + from collectoss.tasks.git.facade_tasks import clone_repos + from collectoss.tasks.github.contributors import process_contributors + from collectoss.tasks.db.refresh_materialized_views import refresh_materialized_views + from collectoss.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model + from collectoss.application.db import temporary_database_engine # Need to engine to be temporary so that there isn't an engine defined when the parent is forked to create worker processes with temporary_database_engine() as engine, DatabaseSession(logger, engine) as session: - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) collection_interval = config.get_value('Tasks', 'collection_interval') logger.info(f"Scheduling collection every {collection_interval/60} minutes") - sender.add_periodic_task(collection_interval, augur_collection_monitor.s()) + sender.add_periodic_task(collection_interval, collection_monitor.s()) #Do longer tasks less often logger.info(f"Scheduling data analysis every 30 days") @@ -248,7 +248,7 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Refresh materialized view task is disabled.") # logger.info(f"Scheduling update of collection weights on midnight each day") - # sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) + # sender.add_periodic_task(crontab(hour=0, minute=0),collection_update_weights.s()) logger.info(f"Setting 404 repos to be marked for retry on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) @@ -265,9 +265,9 @@ def setup_loggers(*args,**kwargs): all_celery_tasks = list(current_app.tasks.keys()) - augur_tasks = [task for task in all_celery_tasks if 'celery.' not in task] + tasks = [task for task in all_celery_tasks if 'celery.' 
not in task] - TaskLogConfig(split_tasks_into_groups(augur_tasks)) + TaskLogConfig(split_tasks_into_groups(tasks)) #engine = None @@ -278,7 +278,7 @@ def init_worker(**kwargs): # global engine - # from augur.application.db.engine import DatabaseEngine + # from collectoss.application.db.engine import DatabaseEngine # from sqlalchemy.pool import NullPool, StaticPool # engine = DatabaseEngine(poolclass=StaticPool).engine @@ -287,7 +287,7 @@ def init_worker(**kwargs): @worker_process_shutdown.connect def shutdown_worker(**kwargs): - from augur.application.db import dispose_database_engine + from collectoss.application.db import dispose_database_engine dispose_database_engine() # global engine diff --git a/augur/tasks/init/redis_connection.py b/collectoss/tasks/init/redis_connection.py similarity index 87% rename from augur/tasks/init/redis_connection.py rename to collectoss/tasks/init/redis_connection.py index 33e0aec3f..9dac45f3f 100644 --- a/augur/tasks/init/redis_connection.py +++ b/collectoss/tasks/init/redis_connection.py @@ -1,6 +1,6 @@ """Defines the redis connection.""" import redis -from augur.tasks.init import get_redis_conn_values +from collectoss.tasks.init import get_redis_conn_values def get_redis_connection(): diff --git a/augur/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py similarity index 85% rename from augur/tasks/start_tasks.py rename to collectoss/tasks/start_tasks.py index 91e05c6fc..644b6cbc4 100644 --- a/augur/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -4,32 +4,33 @@ #from celery.result import AsyncResult from celery import group, chain from sqlalchemy import and_,update +import sqlalchemy as s -from augur.tasks.github import * +from collectoss.tasks.github import * if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": - from augur.tasks.data_analysis import * -from augur.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary -from augur.tasks.github.releases.tasks import collect_releases -from augur.tasks.github.repo_info.tasks import collect_repo_info, collect_linux_badge_info -from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files -from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits -from augur.tasks.github.util.populate_repo_src_id import populate_repo_src_id_task -from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics -from augur.tasks.github.traffic import collect_github_repo_clones_data -from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files, collect_merge_request_comments -from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments -from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events -from augur.tasks.git.facade_tasks import * -from augur.tasks.db.refresh_materialized_views import * -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.models import CollectionStatus, Repo -from augur.tasks.util.collection_state import CollectionState -from augur.tasks.util.collection_util import * -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor -from augur.application.db.lib import execute_sql, get_session -from augur.application.config import 
AugurConfig + from collectoss.tasks.data_analysis import * +from collectoss.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary +from collectoss.tasks.github.releases.tasks import collect_releases +from collectoss.tasks.github.repo_info.tasks import collect_repo_info, collect_linux_badge_info +from collectoss.tasks.github.pull_requests.files_model.tasks import process_pull_request_files +from collectoss.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits +from collectoss.tasks.github.util.populate_repo_src_id import populate_repo_src_id_task +from collectoss.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics +from collectoss.tasks.github.traffic import collect_github_repo_clones_data +from collectoss.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files, collect_merge_request_comments +from collectoss.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments +from collectoss.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events +from collectoss.tasks.git.facade_tasks import * +from collectoss.tasks.db.refresh_materialized_views import * +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.session import DatabaseSession +from collectoss.application.db.models import CollectionStatus, Repo +from collectoss.tasks.util.collection_state import CollectionState +from collectoss.tasks.util.collection_util import * +from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor +from collectoss.application.db.lib import execute_sql, get_session +from collectoss.application.config import SystemConfig RUNNING_DOCKER = os.environ.get('AUGUR_DOCKER_DEPLOY') == "1" @@ -59,7 +60,7 @@ def prelim_phase_secondary(repo_git, full_collection): return detect_github_repo_move_secondary.si(repo_git) -#This is the phase that defines the message for core augur collection +#This is the phase that defines the message for core collectoss collection #A chain is needed for each repo. 
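# Illustrative sketch, not part of this changeset, of the chain-per-repo idea the
# comment above describes: each phase yields an immutable Celery signature bound to
# one repo, and the signatures are linked so they run strictly in order. The helper
# and phase names here are hypothetical.
from celery import chain

def build_repo_chain(repo_git, phases):
    # .si() creates an immutable signature (it ignores the previous task's result),
    # matching how the phases in this file are composed further below.
    return chain(*(phase.si(repo_git) for phase in phases))

# Hypothetical usage:
#   build_repo_chain(repo_git, [collect_issues, collect_events]).apply_async()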
def primary_repo_collect_phase(repo_git, full_collection): logger = logging.getLogger(primary_repo_collect_phase.__name__) @@ -147,7 +148,7 @@ def non_repo_domain_tasks(self): if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: #enabled_tasks.extend(machine_learning_phase()) - from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model + from collectoss.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model enabled_tasks.append(contributor_breadth_model.si()) tasks = chain( @@ -237,11 +238,11 @@ def ml_task_success_util_gen(repo_git, full_collection): return request @celery.task(bind=True) -def augur_collection_monitor(self): +def collection_monitor(self): engine = self.app.engine - logger = logging.getLogger(augur_collection_monitor.__name__) + logger = logging.getLogger(collection_monitor.__name__) logger.info("Checking for repos to collect") @@ -254,7 +255,7 @@ def augur_collection_monitor(self): with DatabaseSession(logger, self.app.engine) as session: # Get config values for collection intervals - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) core_interval = config.get_value('Tasks', 'core_collection_interval_days') or 15 secondary_interval = config.get_value('Tasks', 'secondary_collection_interval_days') or 10 facade_interval = config.get_value('Tasks', 'facade_collection_interval_days') or 10 @@ -277,7 +278,7 @@ def augur_collection_monitor(self): logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) + main_routine = CollectionTaskRoutine(logger, enabled_collection_hooks) main_routine.start_data_collection() @@ -285,11 +286,11 @@ @celery.task(bind=True) -def augur_collection_update_weights(self): +def collection_update_weights(self): engine = self.app.engine - logger = logging.getLogger(augur_collection_update_weights.__name__) + logger = logging.getLogger(collection_update_weights.__name__) logger.info("Updating stale collection weights") @@ -338,7 +339,7 @@ def retry_errored_repos(self): logger = logging.getLogger(create_collection_status_records.__name__) #TODO: Isaac needs to normalize the status's to be abstract in the - #collection_status table once augur dev is less unstable. + #collection_status table once collectoss dev is less unstable. 
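    # Descriptive note on the statements below: the two UPDATEs (run via the `s` alias
    # this diff adds with `import sqlalchemy as s`) move errored repos whose collection
    # never completed back to PENDING, so the collection monitor can schedule them again.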
query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" diff --git a/augur/tasks/test.py b/collectoss/tasks/test.py similarity index 63% rename from augur/tasks/test.py rename to collectoss/tasks/test.py index efdacb77f..f5e41433a 100644 --- a/augur/tasks/test.py +++ b/collectoss/tasks/test.py @@ -1,4 +1,4 @@ -from augur.tasks.init.celery_app import celery_app as celery +from collectoss.tasks.init.celery_app import celery_app as celery @celery.task() def successful_task(): diff --git a/augur/tasks/util/AugurUUID.py b/collectoss/tasks/util/ContributorUUID.py similarity index 94% rename from augur/tasks/util/AugurUUID.py rename to collectoss/tasks/util/ContributorUUID.py index ae8f05f12..f88fb8850 100644 --- a/augur/tasks/util/AugurUUID.py +++ b/collectoss/tasks/util/ContributorUUID.py @@ -3,7 +3,7 @@ import uuid import typing -class AugurUUID: +class ContributorUUID: struct = { "platform": {"start": 0, "size": 1} } @@ -89,13 +89,13 @@ def __setitem__(self, key: str, value: int): self.write_int(value, structure["start"], structure["size"]) # Referencing a class type within itself like this requies the annotations import from above - def __eq__(self, other: AugurUUID)-> bool: + def __eq__(self, other: ContributorUUID)-> bool: return self.bytes == other.bytes - def __lt__(self, other: AugurUUID)-> bool: + def __lt__(self, other: ContributorUUID)-> bool: return int(self) < int(other) - def __gt__(self, other: AugurUUID)-> bool: + def __gt__(self, other: ContributorUUID)-> bool: return int(self) > int(other) def __len__(self)-> int: @@ -116,7 +116,7 @@ def __str__(self)-> str: def __iter__(self): return (byte for byte in self.bytes) -class GithubUUID(AugurUUID): +class GithubUUID(ContributorUUID): struct = { "platform": {"start": 0, "size": 1}, "user": {"start": 1, "size": 4}, @@ -129,7 +129,7 @@ class GithubUUID(AugurUUID): def __init__(self): super().__init__(platform = 1) -class GitlabUUID(AugurUUID): +class GitlabUUID(ContributorUUID): struct = { "platform": {"start": 0, "size": 1}, "user": {"start": 1, "size": 4}, @@ -161,7 +161,7 @@ def __init__(self): # print(uid) - # id = AugurUUID() + # id = ContributorUUID() # # print(id) # @@ -176,4 +176,4 @@ def __init__(self): # # print(id[0]) # - # print(AugurUUID(17, 1, 1).bytes) + # print(ContributorUUID(17, 1, 1).bytes) diff --git a/augur/tasks/util/__init__.py b/collectoss/tasks/util/__init__.py similarity index 100% rename from augur/tasks/util/__init__.py rename to collectoss/tasks/util/__init__.py diff --git a/augur/tasks/util/collection_state.py b/collectoss/tasks/util/collection_state.py similarity index 100% rename from augur/tasks/util/collection_state.py rename to collectoss/tasks/util/collection_state.py diff --git a/augur/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py similarity index 92% rename from augur/tasks/util/collection_util.py rename to collectoss/tasks/util/collection_util.py index 353ba1ba4..18009d207 100644 --- a/augur/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -6,17 +6,17 @@ from celery import chain import sqlalchemy as s from sqlalchemy import or_, update -from augur.application.logs import AugurLogger -from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.models import 
CollectionStatus, Repo -from augur.application.db.util import execute_session_query -from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue -from augur.application.db import get_engine -from augur.application.db.lib import execute_sql, get_session, get_active_repo_count, get_repo_by_repo_git -from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps -from augur.tasks.util.collection_state import CollectionState -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from collectoss.application.logs import SystemLogger +from collectoss.tasks.init.celery_app import celery_app as celery +from collectoss.application.db.models import CollectionStatus, Repo +from collectoss.application.db.util import execute_session_query +from collectoss.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue +from collectoss.application.db import get_engine +from collectoss.application.db.lib import execute_sql, get_session, get_active_repo_count, get_repo_by_repo_git +from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps +from collectoss.tasks.util.collection_state import CollectionState +from collectoss.application.db.session import DatabaseSession +from collectoss.application.config import SystemConfig class CollectionRequest: @@ -117,7 +117,7 @@ def get_enabled_phase_names_from_config(engine, logger): def get_enabled_phase_names_from_config_session(session, logger): - config = AugurConfig(logger, session) + config = SystemConfig(logger, session) return get_enabled_phase_names_from_config_object(config) @@ -412,7 +412,7 @@ def facade_clone_success_util(self, repo_git): session.commit() -class AugurCollectionTotalRepoWeight: +class CollectionTotalRepoWeight: """ small class to encapsulate the weight calculation of each repo that is being scheduled. Intended to be used as a counter where while it is greater than @@ -422,12 +422,12 @@ class AugurCollectionTotalRepoWeight: Attributes: - logger (Logger): Get logger from AugurLogger + logger (Logger): Get logger from SystemLogger value (int): current value of the collection weight value_weight_calculation (function): Function to use on repo to determine weight """ def __init__(self,starting_value: int, weight_calculation=get_repo_weight_core): - self.logger = AugurLogger("data_collection_jobs").get_logger() + self.logger = SystemLogger("data_collection_jobs").get_logger() self.value = starting_value self.value_weight_calculation = weight_calculation @@ -437,7 +437,7 @@ def __sub__(self, other): if isinstance(other, int): self.value -= other - elif isinstance(other, AugurCollectionTotalRepoWeight): + elif isinstance(other, CollectionTotalRepoWeight): self.value -= other.value elif isinstance(other, Repo): repo_weight = self.value_weight_calculation(self.logger,other.repo_git) @@ -454,7 +454,7 @@ def __sub__(self, other): return self -class AugurTaskRoutine: +class CollectionTaskRoutine: """ class to keep track of various groups of collection tasks for a group of repos. Simple version to just schedule a number of repos not worrying about repo weight. @@ -463,9 +463,9 @@ class to keep track of various groups of collection tasks for a group of repos. Attributes: - logger (Logger): Get logger from AugurLogger + logger (Logger): Get logger from SystemLogger repos (List[str]): List of repo_ids to run collection on. - collection_phases (List[str]): List of phases to run in augur collection. 
+ collection_phases (List[str]): List of phases to run in collectoss collection. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. core session: Database session to use """ @@ -503,9 +503,7 @@ def start_data_collection(self): for repo_git, task_id, hook_name in self.send_messages(): self.update_status_and_id(repo_git,task_id,hook_name, session) - def send_messages(self): - augur_collection_list = [] - + def send_messages(self): for col_hook in self.collection_hooks: self.logger.info(f"Starting collection on {len(col_hook.repo_list)} {col_hook.name} repos") @@ -527,16 +525,16 @@ def send_messages(self): continue phases = col_hook.gitlab_phases - augur_collection_sequence = [] + collection_sequence = [] for job in phases: #Add the phase to the sequence in order as a celery task. #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git, full_collection)) + collection_sequence.append(job(repo_git, full_collection)) - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #collection_sequence.append(core_task_success_util.si(repo_git)) #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id + collection_chain = chain(*collection_sequence) + task_id = collection_chain.apply_async().task_id self.logger.info(f"Setting {platform_name} repo {col_hook.name} status to collecting for repo: {repo_git}") diff --git a/augur/tasks/util/metadata_exception.py b/collectoss/tasks/util/metadata_exception.py similarity index 100% rename from augur/tasks/util/metadata_exception.py rename to collectoss/tasks/util/metadata_exception.py diff --git a/augur/tasks/util/random_key_auth.py b/collectoss/tasks/util/random_key_auth.py similarity index 97% rename from augur/tasks/util/random_key_auth.py rename to collectoss/tasks/util/random_key_auth.py index dc59544ae..409b5889c 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/collectoss/tasks/util/random_key_auth.py @@ -4,7 +4,7 @@ from httpx import Auth, Request, Response from random import choice import hashlib -from augur.util.keys import mask_key +from collectoss.util.keys import mask_key class RandomKeyAuth(Auth): diff --git a/augur/tasks/util/redis_list.py b/collectoss/tasks/util/redis_list.py similarity index 98% rename from augur/tasks/util/redis_list.py rename to collectoss/tasks/util/redis_list.py index 933b041a1..ea44ebb4a 100644 --- a/augur/tasks/util/redis_list.py +++ b/collectoss/tasks/util/redis_list.py @@ -4,8 +4,8 @@ from typing import Iterable, Any, Union from collections.abc import MutableSequence -from augur.tasks.init.redis_connection import get_redis_connection -from augur import instance_id +from collectoss.tasks.init.redis_connection import get_redis_connection +from collectoss import instance_id class RedisList(MutableSequence): diff --git a/augur/tasks/util/redis_scalar.py b/collectoss/tasks/util/redis_scalar.py similarity index 91% rename from augur/tasks/util/redis_scalar.py rename to collectoss/tasks/util/redis_scalar.py index 45579689d..29a8bb85c 100644 --- a/augur/tasks/util/redis_scalar.py +++ b/collectoss/tasks/util/redis_scalar.py @@ -4,8 +4,8 @@ from typing import Iterable, Any, Union from collections.abc import MutableSequence -from augur.tasks.init.redis_connection import get_redis_connection -from augur import instance_id +from collectoss.tasks.init.redis_connection import get_redis_connection +from 
collectoss import instance_id from redis import exceptions import numbers diff --git a/augur/tasks/util/worker_util.py b/collectoss/tasks/util/worker_util.py similarity index 98% rename from augur/tasks/util/worker_util.py rename to collectoss/tasks/util/worker_util.py index 5ec2e6eeb..2c5943560 100644 --- a/augur/tasks/util/worker_util.py +++ b/collectoss/tasks/util/worker_util.py @@ -11,7 +11,7 @@ import json import subprocess -from augur.tasks.util.metadata_exception import MetadataException +from collectoss.tasks.util.metadata_exception import MetadataException def create_grouped_task_load(*args,processes=8,dataList=[],task=None): diff --git a/augur/templates/admin-dashboard.j2 b/collectoss/templates/admin-dashboard.j2 similarity index 98% rename from augur/templates/admin-dashboard.j2 rename to collectoss/templates/admin-dashboard.j2 index a24829c99..ea4100d49 100644 --- a/augur/templates/admin-dashboard.j2 +++ b/collectoss/templates/admin-dashboard.j2 @@ -18,7 +18,7 @@ - Dasboard - Augur View + Dashboard - CollectOSS View @@ -45,7 +45,7 @@