# Workflow identity: the same label is used for the workflow and for each run.
name: Scrape transfer and update file
run-name: Scrape transfer and update file

on:
  # Timed run: 10:00 UTC on every second day of the month (1st, 3rd, 5th, ...).
  schedule:
    - cron: "0 10 */2 * *"
  # Manual run from the Actions tab or the API.
  workflow_dispatch:

jobs:
  # Downloads the transfer guide landing page, reads the total page count from
  # it and publishes a matrix with one {"page": N} entry per page.
  setup:
    name: Get number of pages and set up scrape page jobs
    runs-on: ubuntu-latest
    steps:
      - name: Create matrix parameters
        id: matrix-params
        run: |
          # The default shell is `bash -e` without pipefail; enable it so a
          # failing curl inside a pipeline is not masked by the next command.
          set -euo pipefail

          UA='User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'
          URL='https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce'
          curl -s -S -H "$UA" "$URL" | tr -d '\0' > home.html
          echo "======================================================================"
          # Abort when the error marker IS present. The previous `-z` test was
          # inverted: it aborted when the marker was missing.
          if grep -q -F "Oops! Page currently not available (" home.html; then
            echo "Transfer guide unavailable"
            exit 1
          fi
          echo "======================================================================"
          grep -A2 "<title>" home.html
          echo "======================================================================"
          # Take the number after "of " on the pagination label line.
          NUM_PAGES="$(grep -e 'lblInstWithEQPaginationInfo' home.html | grep -Poie '(?<=of )[0-9]+' || true)"
          # Reject empty, non-numeric or multi-line results with a clear message
          # instead of letting seq fail or emitting an invalid matrix.
          case "$NUM_PAGES" in
            ''|*[!0-9]*)
              echo "Could not determine the number of pages" >&2
              exit 1
              ;;
          esac
          if [ "$NUM_PAGES" -lt 1 ]; then
            echo "Transfer guide reports no pages" >&2
            exit 1
          fi
          echo "Found $NUM_PAGES pages"
          # "1,2,3" -> {"include": [{"page": 1}, {"page": 2}, {"page": 3}]}
          PAGES="$(seq -s "," 1 "$NUM_PAGES")"
          MATRIX_PARAMS="{\"include\": [{\"page\": $(sed -e 's/,/}, {"page": /g' <<< "$PAGES")}]}"
          # Append (never truncate) the step output file.
          echo "matrix-params=$MATRIX_PARAMS" | tee -a "$GITHUB_OUTPUT"
    outputs:
      matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}

  # Runs the reusable page scraper once per matrix entry produced by `setup`.
  scrape-page:
    name: Scrape page
    needs: setup
    uses: ./.github/workflows/scrape_page.yml
    strategy:
      matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
      fail-fast: true
    with:
      page: ${{ matrix.page }}

  # Merges the per-page artifacts into data/transfer.json, commits the result
  # to quatalog/data and then asks quatalog/quatalog to regenerate its CSV.
  commit-data:
    name: Combine/convert and commit data
    runs-on: ubuntu-latest
    needs: scrape-page
    steps:
      # NOTE(review): no step visible in this job runs Python or reads the
      # `scrapers` checkout below - confirm whether both are still required.
      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Checkout data repo
        uses: actions/checkout@v4
        with:
          repository: quatalog/data
          path: data
          token: ${{ secrets.PUSH_TOKEN }}

      - name: Checkout scraping repo
        uses: actions/checkout@v4
        with:
          path: scrapers

      - name: Download partial JSONs
        uses: actions/download-artifact@v4
        with:
          pattern: transfer-page-*
          merge-multiple: true
          path: new-data

      - name: Combine JSONs
        run: |
          set -euo pipefail
          # Slurp every partial file, concatenate the arrays and order the
          # entries by institution.
          jq -s 'add | sort_by(.institution)' new-data/* > data/transfer.json

      - name: Commit data
        working-directory: data
        run: |
          set -euo pipefail
          git config user.name "Quatalog Updater"
          git config user.email "github_actions@quatalog.com"
          git add transfer.json
          # Stop quietly only when nothing changed; a genuine commit or push
          # failure must fail the job instead of being swallowed.
          if git diff --cached --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          git commit -m "$(date)"
          git push

      - name: Trigger CSV generator
        env:
          # Hand the secret over through the environment rather than
          # interpolating it into the script text.
          PUSH_TOKEN: ${{ secrets.PUSH_TOKEN }}
        run: |
          # --fail turns an HTTP error from the API into a failed step.
          curl -sS --fail \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $PUSH_TOKEN" \
            -X POST --data '{"event_type": "generate-csv"}' \
            'https://api.github.com/repos/quatalog/quatalog/dispatches'