Compare commits
9 Commits
a0b9081f8f
...
aec272e28e
| Author | SHA1 | Date |
|---|---|---|
| powe97 | aec272e28e | |
| powe97 | c21dec05ad | |
| powe97 | 81ba2fdc80 | |
| powe97 | 92c3327b1a | |
| powe97 | 6a1395c054 | |
| powe97 | 912b07f6f3 | |
| powe97 | 8b15438a98 | |
| powe97 | 0007bde18a | |
| powe97 | c98b928125 | |
@ -2,6 +2,8 @@ name: Scrape transfer and update file
|
|||
run-name: Scrape transfer and update file
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 10 * * *'
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
|
@ -17,13 +19,14 @@ jobs:
|
|||
echo "matrix-params={\"include\": [{"$MATRIX_PARAMS"}]}" | tee $GITHUB_OUTPUT
|
||||
outputs:
|
||||
matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
|
||||
|
||||
scrape-page:
|
||||
name: Scrape page
|
||||
runs-on: ubuntu-latest
|
||||
needs: setup
|
||||
strategy:
|
||||
matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
|
||||
fail-fast: false
|
||||
fail-fast: true
|
||||
steps:
|
||||
- name: Checkout scraping repo
|
||||
uses: actions/checkout@v4
|
||||
|
@ -51,3 +54,34 @@ jobs:
|
|||
name: transfer-page-${{ matrix.page }}
|
||||
path: transfer_scraper/transfer_${{ matrix.page }}.json
|
||||
|
||||
commit-data:
|
||||
name: Combine and commit data
|
||||
runs-on: ubuntu-latest
|
||||
needs: scrape-page
|
||||
steps:
|
||||
- name: Checkout data repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: quatalog/data
|
||||
path: data
|
||||
token: ${{ secrets.PUSH_TOKEN }}
|
||||
|
||||
- name: Download partial JSONs
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: transfer-page-*
|
||||
merge-multiple: true
|
||||
path: new-data
|
||||
|
||||
- name: Combine JSONs
|
||||
run: |
|
||||
cat new-data/* | jq -s 'add' > data/transfer.json
|
||||
|
||||
- name: Commit data
|
||||
working-directory: data
|
||||
run: |
|
||||
git config user.name "Quatalog Updater"
|
||||
git config user.email "github_actions@quatalog.com"
|
||||
git add transfer.json
|
||||
git commit -m "$(date)" || exit 0
|
||||
git push
|
||||
|
|
|
@ -89,11 +89,23 @@ def scrape_page(page_num):
|
|||
global driver
|
||||
global options
|
||||
|
||||
driver = webdriver.Firefox(options=options)
|
||||
driver.get(
|
||||
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
|
||||
)
|
||||
jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
|
||||
for i in range(1, 15):
|
||||
try:
|
||||
driver = webdriver.Firefox(options=options)
|
||||
driver.get(
|
||||
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
|
||||
)
|
||||
jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
|
||||
break
|
||||
except Exception as e:
|
||||
driver.quit()
|
||||
print(
|
||||
f"Attempt {i} failed to load page, retrying in 25 seconds...",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sleep(25)
|
||||
else:
|
||||
raise Exception(f"Failed to load the main page after 15 attempts, aborting.")
|
||||
|
||||
num_institutions = len(
|
||||
driver.find_elements(
|
||||
|
@ -103,7 +115,22 @@ def scrape_page(page_num):
|
|||
driver.quit()
|
||||
|
||||
print(f"Scraping page {page_num}, found {num_institutions} links", file=sys.stderr)
|
||||
return [scrape_institution(i, page_num) for i in range(0, num_institutions)]
|
||||
return [scrape_institution_safe(i, page_num) for i in range(0, num_institutions)]
|
||||
|
||||
|
||||
def scrape_institution_safe(index, page_num):
    """Scrape one institution by index, retrying on failure.

    Wraps scrape_institution(index, page_num): on any exception it quits the
    global Selenium driver (the session may be wedged), waits 25 seconds, and
    tries again.  Returns scrape_institution's result on the first success.

    Raises:
        Exception: if every attempt fails.
    """
    max_attempts = 15  # single source of truth: range(1, 15) was only 14 tries,
                       # while the error message claimed 15
    for attempt in range(1, max_attempts + 1):
        try:
            return scrape_institution(index, page_num)
        except Exception as e:
            # Discard the browser session before retrying; a fresh driver is
            # created by the caller / scrape_institution on the next attempt.
            driver.quit()
            print(
                f"\tAttempt {attempt} failed due to {type(e).__name__}: {e}, retrying in 25 seconds...",
                file=sys.stderr,
            )
            sleep(25)
    # The loop body either returns or loops, so reaching here means all
    # attempts failed — no for/else needed.
    raise Exception(f"Failed to scrape {index} after {max_attempts} attempts, aborting.")
|
||||
|
||||
|
||||
# scrape_institution: Scrapes an institution by index.
|
||||
|
|
Loading…
Reference in New Issue