diff --git a/.github/workflows/transfer.yml b/.github/workflows/transfer.yml
index 982b004..41716fa 100644
--- a/.github/workflows/transfer.yml
+++ b/.github/workflows/transfer.yml
@@ -59,6 +59,11 @@ jobs:
     runs-on: ubuntu-latest
     needs: scrape-page
     steps:
+      - name: Set up python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
       - name: Checkout data repo
         uses: actions/checkout@v4
         with:
@@ -66,11 +71,6 @@ jobs:
           path: data
           token: ${{ secrets.PUSH_TOKEN }}
 
-      - name: Set up python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
       - name: Checkout scraping repo
         uses: actions/checkout@v4
         with:
@@ -87,15 +87,25 @@ jobs:
         run: |
           cat new-data/* | jq -s 'add | sort_by(.institution)' > data/transfer.json
 
-      - name: Convert to by-courses format
-        run: |
-          python scrapers/transfer_scraper/convert_json.py data/transfer.json data/transfer_by_course.json
-
       - name: Commit data
         working-directory: data
         run: |
           git config user.name "Quatalog Updater"
           git config user.email "github_actions@quatalog.com"
-          git add transfer*.json
+          git add transfer.json
           git commit -m "$(date)" || exit 0
           git push
+
+      # Runs after the commit step: POSTs a `generate-csv` repository_dispatch
+      # event to quatalog/quatalog. The token is handed over through the
+      # environment instead of being interpolated into the script text, and
+      # --fail makes an HTTP error response fail the step instead of passing.
+      - name: Run CSV generator
+        env:
+          PUSH_TOKEN: ${{ secrets.PUSH_TOKEN }}
+        run: |
+          curl --fail \
+               -H "Accept: application/vnd.github.everest-preview+json" \
+               -H "Authorization: token ${PUSH_TOKEN}" \
+               --request POST --data '{"event_type": "generate-csv"}' \
+               "https://api.github.com/repos/quatalog/quatalog/dispatches"