---
# Scrapes the TES public-view transfer listing page by page, merges the
# per-page JSON artifacts into data/transfer.json in the quatalog/data
# repository, commits the result, and then dispatches a `generate-csv` event.
name: Scrape transfer and update file
run-name: Scrape transfer and update file

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Daily at 10:00 (cron schedules are evaluated in UTC by GitHub).
  schedule:
    - cron: '0 10 * * *'

jobs:
  # Reads the page count from the listing and publishes it as a job matrix
  # of the form {"include": [{"page": 1}, {"page": 2}, ...]}.
  setup:
    name: Get number of pages and set up scrape page jobs
    runs-on: ubuntu-latest
    outputs:
      matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
    steps:
      - name: Create matrix parameters
        id: matrix-params
        run: |
          # Fail on any error, including a failing stage inside a pipeline,
          # so an HTTP error is not silently turned into an empty page count.
          set -euo pipefail

          USER_AGENT='Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
          TES_URL='https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce'

          # Take the digits that follow "of " on the line containing the
          # lblInstWithEQPaginationInfo element.
          NUM_PAGES="$(
            curl --fail --silent --show-error \
              -H "User-Agent: $USER_AGENT" "$TES_URL" \
              | grep -e 'lblInstWithEQPaginationInfo' \
              | grep -Poie '(?<=of )[0-9]*'
          )"

          # Anything other than a single positive integer would produce an
          # invalid matrix, so stop here with a readable error instead.
          if ! [[ "$NUM_PAGES" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::Could not determine the number of pages (got '$NUM_PAGES')"
            exit 1
          fi

          # Build the matrix JSON with jq rather than by string splicing.
          MATRIX_JSON="$(
            jq -cn --argjson n "$NUM_PAGES" \
              '{include: [range(1; $n + 1) | {page: .}]}'
          )"

          # Append (not overwrite) to the step output file; tee also echoes
          # the value into the job log.
          echo "matrix-params=$MATRIX_JSON" | tee -a "$GITHUB_OUTPUT"

  # Calls the reusable scrape_page workflow once per matrix entry.
  scrape-page:
    name: Scrape page
    needs: setup
    uses: ./.github/workflows/scrape_page.yml
    strategy:
      matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
      fail-fast: true
    with:
      page: ${{ matrix.page }}

  # Merges the downloaded per-page JSON files and pushes the result.
  commit-data:
    name: Combine/convert and commit data
    runs-on: ubuntu-latest
    needs: scrape-page
    steps:
      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Checkout data repo
        uses: actions/checkout@v4
        with:
          repository: quatalog/data
          path: data
          # Token supplied from repository secrets; the later `git push`
          # runs inside this checkout.
          token: ${{ secrets.PUSH_TOKEN }}

      - name: Checkout scraping repo
        uses: actions/checkout@v4
        with:
          path: scrapers

      # Artifacts matching transfer-page-* are merged into one directory
      # (presumably uploaded by the scrape-page jobs; confirm in
      # scrape_page.yml).
      - name: Download partial JSONs
        uses: actions/download-artifact@v4
        with:
          pattern: transfer-page-*
          merge-multiple: true
          path: new-data

      # Slurp every downloaded file, concatenate them with `add`, and sort
      # the combined array by its `institution` field.
      - name: Combine JSONs
        run: |
          cat new-data/* | jq -s 'add | sort_by(.institution)' > data/transfer.json

      - name: Commit data
        working-directory: data
        run: |
          git config user.name "Quatalog Updater"
          git config user.email "github_actions@quatalog.com"
          git add transfer.json
          # If the commit fails (e.g. nothing changed), end the step
          # successfully without pushing.
          git commit -m "$(date)" || exit 0
          git push

      # Only event-type is given here, so the action's defaults apply for
      # every other input.
      - name: Trigger CSV generator
        uses: peter-evans/repository-dispatch@v3
        with:
          event-type: generate-csv