---
# Scrapes the transfer guide page by page and commits the combined result.
#
# Jobs:
#   setup        - downloads the guide's landing page, reads the page count
#                  and emits a job matrix with one entry per page.
#   scrape-page  - calls the reusable scrape_page.yml workflow once per page.
#   commit-data  - merges the per-page artifacts into transfer.json, commits
#                  it to quatalog/data and fires the "generate-csv" dispatch.
name: Scrape transfer and update file
run-name: Scrape transfer and update file

on:
  workflow_dispatch:
  # schedule:
  #   - cron: '0 4 * * *'

jobs:
  setup:
    name: Get number of pages and set up scrape page jobs
    runs-on: ubuntu-latest
    outputs:
      matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
    steps:
      - name: Create matrix parameters
        id: matrix-params
        # Explicit bash shell enables `-eo pipefail`, so a failed curl is not
        # hidden behind the exit status of `tr`.
        shell: bash
        run: |
          UA='User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
          URL='https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce'

          # Strip NUL bytes from the response before grepping it.
          curl -s -S -H "$UA" "$URL" | tr -d '\0' > home.html

          echo "======================================================================"
          if grep -q "Oops! Page currently not available (" home.html; then
            echo "Transfer guide unavailable"
            exit 1
          fi

          echo "======================================================================"
          # The empty pattern matches every line, so this dumps the whole page
          # into the log.
          # NOTE(review): the pattern looks like it may have been lost; confirm
          # what this was meant to print.
          grep -A2 "" home.html

          echo "======================================================================"
          # `|| true` keeps a non-match from aborting the script here so the
          # validation below can report a readable error instead.
          NUM_PAGES="$(grep -e 'lblInstWithEQPaginationInfo' home.html \
            | grep -Poie '(?<=of )[0-9]*' || true)"
          if [[ ! "$NUM_PAGES" =~ ^[1-9][0-9]*$ ]]; then
            echo "Could not determine the number of pages (got: '$NUM_PAGES')" >&2
            exit 1
          fi
          echo "Found $NUM_PAGES pages"

          # Build {"include": [{"page": 1}, ..., {"page": N}]} for the matrix.
          # NOTE(review): GitHub caps a matrix at 256 jobs per workflow run;
          # confirm the guide cannot grow past that many pages.
          MATRIX_PARAMS="$(jq -cn --argjson pages "$NUM_PAGES" \
            '{include: [range(1; $pages + 1) | {page: .}]}')"

          # Append (never truncate) the step-output file; tee also logs it.
          echo "matrix-params=$MATRIX_PARAMS" | tee -a "$GITHUB_OUTPUT"

  scrape-page:
    name: Scrape page
    needs: setup
    uses: ./.github/workflows/scrape_page.yml
    strategy:
      matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
      fail-fast: true
    with:
      page: ${{ matrix.page }}

  commit-data:
    name: Combine/convert and commit data
    runs-on: ubuntu-latest
    needs: scrape-page
    steps:
      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Checkout data repo
        uses: actions/checkout@v4
        with:
          repository: quatalog/data
          path: data
          token: ${{ secrets.PUSH_TOKEN }}

      - name: Checkout scraping repo
        uses: actions/checkout@v4
        with:
          path: scrapers

      - name: Download partial JSONs
        uses: actions/download-artifact@v4
        with:
          pattern: transfer-page-*
          merge-multiple: true
          path: new-data

      - name: Combine JSONs
        # Slurp every per-page JSON document, concatenate them with `add` and
        # order the result by institution.
        run: |
          jq -s 'add | sort_by(.institution)' new-data/* > data/transfer.json

      - name: Commit data
        working-directory: data
        run: |
          git config user.name "Quatalog Updater"
          git config user.email "github_actions@quatalog.com"
          git add transfer.json
          # Skip commit and push only when nothing changed; any other git
          # failure should fail the step instead of being swallowed.
          if git diff --cached --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          git commit -m "$(date)"
          git push

      - name: Trigger CSV generator
        env:
          # Passed through the environment so the secret is not spliced into
          # the script text.
          PUSH_TOKEN: ${{ secrets.PUSH_TOKEN }}
        run: |
          # --fail makes an HTTP error response fail the step.
          curl --fail \
            -H "Accept: application/vnd.github.everest-preview+json" \
            -H "Authorization: token $PUSH_TOKEN" \
            -X POST --data '{"event_type": "generate-csv"}' \
            'https://api.github.com/repos/quatalog/quatalog/dispatches'