Compare commits

...

5 Commits

Author SHA1 Message Date
powe97 99ba8873bd
Run twice a day 2024-03-16 21:25:42 -04:00
powe97 eb77e84535
Fix artifact upload 2024-03-16 21:23:23 -04:00
powe97 bdc6b2bcbc
Short circuit on IP ban 2024-03-16 21:16:14 -04:00
powe97 ecfb176c46
Create scrape_page.yml 2024-03-16 20:46:25 -04:00
powe97 ea09d33ac2
Convert to use reusable workflow 2024-03-16 20:45:57 -04:00
3 changed files with 52 additions and 28 deletions

38
.github/workflows/scrape_page.yml vendored Normal file
View File

@@ -0,0 +1,38 @@
# Reusable workflow: scrape a single page of the transfer guide and
# upload the resulting JSON file as a build artifact. Invoked via
# workflow_call from the main scraping workflow (one call per page).
name: Scrape single page of transfer guide
run-name: Scrape single page of transfer guide
on:
  # Only callable from other workflows, never triggered directly.
  workflow_call:
    inputs:
      # Page number of the transfer guide to scrape.
      page:
        required: true
        type: number
jobs:
  scrape-page:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout scraping repo
        uses: actions/checkout@v4
      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          # Cache pip downloads between runs to speed up installs.
          cache: 'pip'
      - name: Install dependencies
        working-directory: transfer_scraper
        run: |
          python -m pip install --upgrade pip
          pip install -r 'requirements.txt'
      - name: Run scraper
        working-directory: transfer_scraper
        run: |
          python3 scrape_page.py ${{ inputs.page }} transfer_${{ inputs.page }}.json
      - name: Upload data to artifact
        # Artifact name embeds the page number so concurrent per-page
        # jobs do not collide; a downstream job combines the artifacts.
        uses: actions/upload-artifact@v4
        with:
          name: transfer-page-${{ inputs.page }}
          path: transfer_scraper/transfer_${{ inputs.page }}.json

View File

@@ -3,7 +3,7 @@ run-name: Scrape transfer and update file
on:
workflow_dispatch:
schedule:
- cron: '0 10 * * *'
- cron: '0 10,22 * * *'
jobs:
setup:
@@ -22,37 +22,13 @@ jobs:
scrape-page:
name: Scrape page
runs-on: ubuntu-latest
needs: setup
uses: ./.github/workflows/scrape_page.yml
strategy:
matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
fail-fast: true
steps:
- name: Checkout scraping repo
uses: actions/checkout@v4
- name: Set up python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'
- name: Install dependencies
working-directory: transfer_scraper
run: |
python -m pip install --upgrade pip
pip install -r 'requirements.txt'
- name: Run scraper
working-directory: transfer_scraper
run: |
python3 scrape_page.py ${{ matrix.page }} transfer_${{ matrix.page }}.json
- name: Upload data to artifact
uses: actions/upload-artifact@v4
with:
name: transfer-page-${{ matrix.page }}
path: transfer_scraper/transfer_${{ matrix.page }}.json
with:
page: ${{ matrix.page }}
commit-data:
name: Combine/convert and commit data

View File

@ -16,6 +16,10 @@ from selenium.common.exceptions import (
)
class IPBanException(Exception):
pass
# Fix course titles accounting for Roman numerals up to X
def normalize_title(input):
s = " ".join(input.split())
@@ -103,8 +107,14 @@ def scrape_page(page_num):
)
wait(EC.visibility_of_element_located((By.TAG_NAME, "body")))
print(f'Title: "{driver.title}"', file=sys.stderr)
if driver.title == "403 Forbidden":
raise IPBanException
jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
break
except IPBanException as e:
driver.quit()
print(f"We are IP-banned, exiting now", file=sys.stderr)
raise e
except Exception as e:
driver.quit()