# Scrapes the public TES transfer-equivalency listing one page per matrix job,
# merges the per-page JSON files into data/transfer.json in the quatalog/data
# repository, and then fires a `generate-csv` repository_dispatch event at
# quatalog/quatalog.
name: Scrape transfer and update file
run-name: Scrape transfer and update file

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Daily at 10:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 10 * * *'

jobs:
  # Reads the page count from the listing's pagination label and publishes a
  # matrix of the form {"include": [{"page": 1}, {"page": 2}, ...]}.
  setup:
    name: Get number of pages and set up scrape page jobs
    runs-on: ubuntu-latest
    outputs:
      matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
    steps:
      - name: Create matrix parameters
        id: matrix-params
        run: |
          USER_AGENT='Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
          TES_URL='https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce'

          # Pull the number that follows "of " on the pagination-label line.
          # `|| true` keeps `bash -e` from aborting silently when nothing
          # matches, so the validation below can report a useful error.
          NUM_PAGES="$(
            curl --fail --silent --show-error \
              -H "User-Agent: $USER_AGENT" "$TES_URL" \
              | grep -e 'lblInstWithEQPaginationInfo' \
              | grep -Poie '(?<=of )[0-9]*' \
              || true
          )"

          # Require exactly one positive integer; an empty, zero or multi-line
          # value would otherwise produce an invalid matrix.
          if ! [[ "$NUM_PAGES" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::Could not determine the number of pages (got: '${NUM_PAGES}')"
            exit 1
          fi

          # NOTE(review): GitHub limits a matrix to 256 jobs per workflow run;
          # a larger page count would need batching.
          MATRIX_JSON="$(jq -cn --argjson n "$NUM_PAGES" \
            '{include: [range(1; $n + 1) | {page: .}]}')"

          # Append (never truncate) the step-output file; tee also echoes the
          # value into the job log.
          echo "matrix-params=$MATRIX_JSON" | tee -a "$GITHUB_OUTPUT"

  # One job per listing page; each uploads transfer_<page>.json as the
  # artifact transfer-page-<page>.
  scrape-page:
    name: Scrape page
    runs-on: ubuntu-latest
    needs: setup
    strategy:
      matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
      # Cancel the remaining page jobs as soon as one of them fails.
      fail-fast: true
    steps:
      - name: Checkout scraping repo
        uses: actions/checkout@v4

      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        working-directory: transfer_scraper
        run: |
          python -m pip install --upgrade pip
          pip install -r 'requirements.txt'

      - name: Run scraper
        working-directory: transfer_scraper
        env:
          # Passed through the environment instead of being spliced into the
          # script text.
          PAGE: ${{ matrix.page }}
        run: |
          python3 scrape_page.py "$PAGE" "transfer_${PAGE}.json"

      - name: Upload data to artifact
        uses: actions/upload-artifact@v4
        with:
          name: transfer-page-${{ matrix.page }}
          path: transfer_scraper/transfer_${{ matrix.page }}.json

  # Merges every per-page artifact, commits the result to quatalog/data and
  # triggers the downstream CSV generation.
  commit-data:
    name: Combine/convert and commit data
    runs-on: ubuntu-latest
    needs: scrape-page
    steps:
      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Checkout data repo
        uses: actions/checkout@v4
        with:
          repository: quatalog/data
          path: data
          token: ${{ secrets.PUSH_TOKEN }}

      - name: Checkout scraping repo
        uses: actions/checkout@v4
        with:
          path: scrapers

      - name: Download partial JSONs
        uses: actions/download-artifact@v4
        with:
          pattern: transfer-page-*
          merge-multiple: true
          path: new-data

      - name: Combine JSONs
        run: |
          # Slurp all per-page documents, merge them with `add`, and sort the
          # result by the `institution` field.
          cat new-data/* | jq -s 'add | sort_by(.institution)' > data/transfer.json

      - name: Commit data
        working-directory: data
        run: |
          git config user.name "Quatalog Updater"
          git config user.email "github_actions@quatalog.com"
          git add transfer.json
          # Skip the commit only when nothing changed; any other git failure
          # now fails the step instead of being swallowed.
          if git diff --cached --quiet; then
            echo "transfer.json is unchanged; nothing to commit."
            exit 0
          fi
          git commit -m "$(date)"
          git push

      - name: Run CSV generator
        env:
          # Keep the secret out of the script text; the shell reads it from
          # the environment.
          PUSH_TOKEN: ${{ secrets.PUSH_TOKEN }}
        run: |
          # --fail makes an HTTP error response fail the step.
          curl --fail \
            -H "Accept: application/vnd.github.everest-preview+json" \
            -H "Authorization: token $PUSH_TOKEN" \
            --request POST --data '{"event_type": "generate-csv"}' \
            "https://api.github.com/repos/quatalog/quatalog/dispatches"