# Generate CDSE Documentation Search Index #46
# Workflow that generates the CDSE documentation search index, either on a
# nightly schedule or on demand.
name: Generate CDSE Documentation Search Index

on:
  schedule:
    # GitHub evaluates cron expressions in UTC: once a day at 00:00 UTC.
    - cron: '0 0 * * *'
  # Manual trigger. Both inputs are optional and fall back to their defaults.
  workflow_dispatch:
    inputs:
      start_url:
        description: 'Starting URL for crawling (default: https://documentation.dataspace.copernicus.eu/)'
        required: false
        default: 'https://documentation.dataspace.copernicus.eu/'
      max_minutes:
        description: 'Maximum crawling time in minutes (default: 15)'
        required: false
        default: '15'

# All runs share one concurrency group, so a scheduled run and a manual run
# never execute at the same time.
concurrency: generate-search-index
jobs:
  # Runs the scraper and publishes any change under search_index/ to the
  # docportal-search-index branch through git subtree.
  generate-index:
    runs-on: ubuntu-latest
    # The final step pushes to docportal-search-index, so the job token needs
    # write access to repository contents. No other scope is requested.
    permissions:
      contents: write
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      # Commit identity used by the subtree merge and the index commit below.
      - name: Set Git config
        run: |
          git config --local user.email "actions@github.com"
          git config --local user.name "Github Actions"

      # Bring the docportal-search-index branch into the working tree at
      # search_index/ so it can be updated and pushed back later.
      - name: Create git subtree
        run: git subtree add --prefix search_index origin docportal-search-index

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.14'
          cache: 'pip'
          # Key the pip cache on the requirements file that is actually
          # installed in the next step.
          cache-dependency-path: .github/scripts/requirements.txt

      - name: Install dependencies
        run: pip install -r .github/scripts/requirements.txt

      - name: Generate search index
        # Inputs are handed to the shell as environment variables instead of
        # being interpolated into the script text. The `||` fallbacks supply
        # the defaults when no input is present (e.g. on scheduled runs).
        env:
          START_URL: ${{ github.event.inputs.start_url || 'https://documentation.dataspace.copernicus.eu/' }}
          MAX_MINUTES: ${{ github.event.inputs.max_minutes || '15' }}
        run: |
          python .github/scripts/run.py scrape --start-url "$START_URL" --max-minutes "$MAX_MINUTES"

      # Commit and push only when something under search_index/ is staged.
      - name: Commit
        run: |
          git add search_index
          if git diff --cached --quiet; then
            echo "No changes in search_index to commit"
          else
            git commit -m "search - update of index"
            git subtree push --prefix search_index origin docportal-search-index
          fi