Compare commits


9 Commits

Author  SHA1        Message                                               Date
powe97  aec272e28e  Run once a day                                        2024-03-06 02:43:41 -06:00
powe97  c21dec05ad  Re-enable fail-fast                                   2024-03-06 02:36:33 -06:00
powe97  81ba2fdc80  Make failing actually fail the program                2024-03-06 02:35:18 -06:00
powe97  92c3327b1a  Fix debug prints                                      2024-03-06 02:09:42 -06:00
powe97  6a1395c054  Fix workflow not being able to commit and remove tee  2024-03-06 02:05:49 -06:00
powe97  912b07f6f3  Add retrying first page                               2024-03-06 01:18:49 -06:00
powe97  8b15438a98  Actually use the retry version of the function...     2024-03-06 01:03:23 -06:00
powe97  0007bde18a  Recombine JSONs                                       2024-03-06 00:43:26 -06:00
powe97  c98b928125  Add retrying                                          2024-03-05 22:54:42 -05:00
2 changed files with 68 additions and 7 deletions


@@ -2,6 +2,8 @@ name: Scrape transfer and update file
 run-name: Scrape transfer and update file
 on:
   workflow_dispatch:
+  schedule:
+    - cron: '0 10 * * *'
 jobs:
   setup:
@@ -17,13 +19,14 @@ jobs:
           echo "matrix-params={\"include\": [{"$MATRIX_PARAMS"}]}" | tee $GITHUB_OUTPUT
     outputs:
       matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
   scrape-page:
     name: Scrape page
     runs-on: ubuntu-latest
     needs: setup
     strategy:
       matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
-      fail-fast: false
+      fail-fast: true
     steps:
       - name: Checkout scraping repo
         uses: actions/checkout@v4
@@ -51,3 +54,34 @@
           name: transfer-page-${{ matrix.page }}
           path: transfer_scraper/transfer_${{ matrix.page }}.json
+  commit-data:
+    name: Combine and commit data
+    runs-on: ubuntu-latest
+    needs: scrape-page
+    steps:
+      - name: Checkout data repo
+        uses: actions/checkout@v4
+        with:
+          repository: quatalog/data
+          path: data
+          token: ${{ secrets.PUSH_TOKEN }}
+      - name: Download partial JSONs
+        uses: actions/download-artifact@v4
+        with:
+          pattern: transfer-page-*
+          merge-multiple: true
+          path: new-data
+      - name: Combine JSONs
+        run: |
+          cat new-data/* | jq -s 'add' > data/transfer.json
+      - name: Commit data
+        working-directory: data
+        run: |
+          git config user.name "Quatalog Updater"
+          git config user.email "github_actions@quatalog.com"
+          git add transfer.json
+          git commit -m "$(date)" || exit 0
+          git push
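
The pieces of this workflow fit together as follows: the new schedule trigger fires the run once a day at 10:00 UTC, each scrape-page matrix job uploads a transfer-page-* artifact, and the commit-data job downloads them into new-data/ and combines them. The combine step relies on jq semantics: -s slurps every input file into a single array, and add folds its elements together (arrays are concatenated, objects merged with later keys winning). A minimal Python sketch of that step, assuming each partial file holds a JSON array as scrape_page's list return value suggests (combine_parts and its default paths are illustrative names, not from the repo):

    # Sketch of `cat new-data/* | jq -s 'add'` for array-shaped partials:
    # read every per-page file and concatenate the arrays into one list,
    # mirroring what jq's `add` does when the slurped values are arrays.
    import json
    from pathlib import Path

    def combine_parts(parts_dir="new-data", out_file="transfer.json"):
        combined = []
        for part in sorted(Path(parts_dir).iterdir()):
            combined += json.loads(part.read_text())
        Path(out_file).write_text(json.dumps(combined, indent=2))

If the partials held single objects instead, jq's add would merge their keys (rightmost file winning), and the loop body above would use combined.update(...) rather than list concatenation.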


@@ -89,11 +89,23 @@ def scrape_page(page_num):
     global driver
     global options
-    driver = webdriver.Firefox(options=options)
-    driver.get(
-        "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
-    )
-    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
+    for i in range(1, 15):
+        try:
+            driver = webdriver.Firefox(options=options)
+            driver.get(
+                "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
+            )
+            jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
+            break
+        except Exception as e:
+            driver.quit()
+            print(
+                f"Attempt {i} failed to load page, retrying in 25 seconds...",
+                file=sys.stderr,
+            )
+            sleep(25)
+    else:
+        raise Exception("Failed to load the main page after 14 attempts, aborting.")
     num_institutions = len(
         driver.find_elements(
@@ -103,7 +115,22 @@ def scrape_page(page_num):
     driver.quit()
     print(f"Scraping page {page_num}, found {num_institutions} links", file=sys.stderr)
-    return [scrape_institution(i, page_num) for i in range(0, num_institutions)]
+    return [scrape_institution_safe(i, page_num) for i in range(0, num_institutions)]
+
+
+def scrape_institution_safe(index, page_num):
+    for i in range(1, 15):
+        try:
+            return scrape_institution(index, page_num)
+        except Exception as e:
+            driver.quit()
+            print(
+                f"\tAttempt {i} failed due to {type(e).__name__}: {e}, retrying in 25 seconds...",
+                file=sys.stderr,
+            )
+            sleep(25)
+    else:
+        raise Exception(f"Failed to scrape {index} after 14 attempts, aborting.")
+
+
 # scrape_institution: Scrapes an institution by index.
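
Both retry sites added here lean on Python's for/else: the else suite runs only when the loop completes without hitting break (or returning), so it is the natural place to raise once every attempt has failed. Note that range(1, 15) yields 14 iterations, hence the attempt count in the messages above. A minimal generic sketch of the same pattern, standard library only (retry, action, attempts, and delay are illustrative names, not part of the scraper):

    import sys
    from time import sleep

    def retry(action, attempts=14, delay=25):
        # Try `action` up to `attempts` times, pausing `delay` seconds
        # between failures.
        for i in range(1, attempts + 1):
            try:
                return action()
            except Exception as e:
                print(f"Attempt {i} failed: {e}", file=sys.stderr)
                sleep(delay)
        else:
            # Nothing in this loop breaks, so the else suite always runs
            # once the attempts are exhausted; success returns earlier.
            raise Exception(f"Failed after {attempts} attempts, aborting.")

A call such as retry(lambda: scrape_institution(0, 1)) would wrap the existing function unchanged. In this sketch the else: behaves the same as unindented code after the loop; the page-loading retry above does break on success, which is where for/else genuinely matters.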