Compare commits

..

No commits in common. "d03be03aebf08b2e43bd9c870eac4db2c495881a" and "382f9080e5cb956b66ec1093467b1573ae0a41f9" have entirely different histories.

3 changed files with 221 additions and 274 deletions

View file

@ -10,12 +10,12 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout scraping repo - name: Checkout scraping repo
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
path: quatalog-scraping path: quatalog-scraping
- name: Clone QuACS data - name: Clone QuACS data
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quacs/quacs-data repository: quacs/quacs-data
path: quacs-data path: quacs-data
@ -36,7 +36,7 @@ jobs:
rsync -avz "quacs-data/semester_data/$CURRENT_TERM/catalog.json" new-data/catalog.json rsync -avz "quacs-data/semester_data/$CURRENT_TERM/catalog.json" new-data/catalog.json
- name: Upload data to artifact - name: Upload data to artifact
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v3
with: with:
name: new-data name: new-data
path: new-data/ path: new-data/
@ -47,14 +47,14 @@ jobs:
needs: [scrape-data] needs: [scrape-data]
steps: steps:
- name: Clone Quatalog data - name: Clone Quatalog data
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quatalog/data repository: quatalog/data
path: quatalog-data path: quatalog-data
token: ${{ secrets.PUSH_TOKEN }} token: ${{ secrets.PUSH_TOKEN }}
- name: Download data from artifact - name: Download data from artifact
uses: actions/download-artifact@v4 uses: actions/download-artifact@v3
with: with:
name: new-data name: new-data
path: new-data path: new-data
@ -78,12 +78,12 @@ jobs:
needs: [scrape-data] needs: [scrape-data]
steps: steps:
- name: Checkout scraping repo - name: Checkout scraping repo
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
path: quatalog-scraping path: quatalog-scraping
- name: Clone Quatalog static site - name: Clone Quatalog static site
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quatalog/site repository: quatalog/site
ref: static-generated ref: static-generated
@ -91,7 +91,7 @@ jobs:
token: ${{ secrets.PUSH_TOKEN }} token: ${{ secrets.PUSH_TOKEN }}
- name: Download data from artifact - name: Download data from artifact
uses: actions/download-artifact@v4 uses: actions/download-artifact@v3
with: with:
name: new-data name: new-data
path: new-data path: new-data
@ -129,14 +129,14 @@ jobs:
needs: [generate-site] needs: [generate-site]
steps: steps:
- name: Checkout site repo/static-generated branch - name: Checkout site repo/static-generated branch
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quatalog/site repository: quatalog/site
ref: static-generated ref: static-generated
path: static-generated path: static-generated
- name: Checkout data repo - name: Checkout data repo
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quatalog/data repository: quatalog/data
path: quatalog-data path: quatalog-data
@ -167,13 +167,13 @@ jobs:
needs: [generate-site] needs: [generate-site]
steps: steps:
- name: Checkout static-generated branch - name: Checkout static-generated branch
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quatalog/site repository: quatalog/site
ref: static-generated ref: static-generated
- name: Setup Pages - name: Setup Pages
uses: actions/configure-pages@v4 uses: actions/configure-pages@v3
- name: Archive github-pages artifact - name: Archive github-pages artifact
run: | run: |
@ -186,7 +186,7 @@ jobs:
-cf "$RUNNER_TEMP/artifact.tar" . -cf "$RUNNER_TEMP/artifact.tar" .
- name: Upload github-pages artifact - name: Upload github-pages artifact
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v3
with: with:
name: github-pages name: github-pages
path: ${{ runner.temp }}/artifact.tar path: ${{ runner.temp }}/artifact.tar
@ -207,4 +207,4 @@ jobs:
steps: steps:
- name: Deploy to GitHub Pages - name: Deploy to GitHub Pages
id: deployment id: deployment
uses: actions/deploy-pages@v4 uses: actions/deploy-pages@v1

View file

@ -1,22 +1,13 @@
name: Scrape transfer and update file name: Scrape transfer and update file
run-name: Scrape transfer and update file
on: on:
# schedule:
# - cron: '*/15 * * * *'
repository_dispatch:
types: transfer-scraper
workflow_dispatch: workflow_dispatch:
inputs: # schedule:
timeout: # - cron: '15 * * * *'
description: "Timeout time"
required: true
type: number
default: 120
concurrency: concurrency:
group: transfer-scraper group: transfer-scraper
jobs: jobs:
scrape-data: scrape-transfer:
name: Scrape transfer guide name: Scrape transfer guide
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
@ -25,40 +16,34 @@ jobs:
with: with:
path: quatalog-scraping path: quatalog-scraping
- name: Checkout data repo
uses: actions/checkout@v4
with:
repository: quatalog/data
path: data
- name: Set up python - name: Set up python
uses: actions/setup-python@v5 uses: actions/setup-python@v4
with: with:
python-version: '3.11' python-version: '3.11'
cache: 'pip' cache: 'pip'
- name: Install dependencies - name: Install dependencies
working-directory: quatalog-scraping/transfer_scraper working-directory: quatalog-scraping/transfer_scraper
run: | run: pip install -r 'requirements.txt'
python -m pip install --upgrade pip
pip install -r 'requirements.txt'
- name: Log IP - name: Log IP
run: | run: echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
- name: Retrieve existing data
run:
mkdir data
cd data
wget 'https://raw.githubusercontent.com/powe97/rpi-transfer-scraper/main/transfer.json'
wget 'https://raw.githubusercontent.com/powe97/rpi-transfer-scraper/main/transfer_state.json'
- name: Scrape transfer guide - name: Scrape transfer guide
run: | run: python3 quatalog-scraping/transfer_scraper data/transfer.json data/transfer_state.json
mkdir new-data
rsync -avzh data/transfer.json new-data
rsync -avzh data/transfer_state.json new-data
python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout }}
- name: Upload data to artifact - name: Upload data to artifact
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: transfer-data name: transfer-data
path: new-data/ path: data/
push-new-data: push-new-data:
name: Push new data to data repo name: Push new data to data repo
@ -66,21 +51,21 @@ jobs:
needs: [scrape-data] needs: [scrape-data]
steps: steps:
- name: Clone Quatalog data - name: Clone Quatalog data
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
repository: quatalog/data repository: quatalog/data
path: quatalog-data path: quatalog-data
token: ${{ secrets.PUSH_TOKEN }} token: ${{ secrets.PUSH_TOKEN }}
- name: Download data from artifact - name: Download data from artifact
uses: actions/download-artifact@v4 uses: actions/download-artifact@v3
with: with:
name: transfer-data name: transfer-data
path: data path: data/
- name: Copy data to repo directory - name: Copy data to repo directory
run: | run: |
rsync -avzh data/ quatalog-data/ rsync -avz data/ quatalog-data/
- name: Push new data - name: Push new data
working-directory: quatalog-data working-directory: quatalog-data
@ -90,17 +75,3 @@ jobs:
git add transfer.json transfer_state.json git add transfer.json transfer_state.json
git commit -m "$(date)" || exit 0 git commit -m "$(date)" || exit 0
git push git push
re-run-scraper:
name: Tell Github to run this workflow again
runs-on: ubuntu-latest
needs: [push-new-data]
steps:
- name: Tell Github to run this workflow again
run: |
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: token ${{ secrets.PUSH_TOKEN }}" \
--request POST \
--data '{"event_type": "transfer-scraper"}' \
"https://api.github.com/repos/quatalog/quatalog/dispatches"

View file

@ -31,8 +31,6 @@ def normalize_class_name(input):
def wait(ec): def wait(ec):
global driver
WebDriverWait( WebDriverWait(
driver, 20, ignored_exceptions=[StaleElementReferenceException] driver, 20, ignored_exceptions=[StaleElementReferenceException]
).until(ec) ).until(ec)
@ -40,8 +38,6 @@ def wait(ec):
def scrape_course_card(html_id, i, note): def scrape_course_card(html_id, i, note):
global driver
trs = ( trs = (
driver.find_element("id", html_id) driver.find_element("id", html_id)
.find_elements(By.CSS_SELECTOR, ".course-detail")[i] .find_elements(By.CSS_SELECTOR, ".course-detail")[i]
@ -104,31 +100,19 @@ def scrape_course_card(html_id, i, note):
} }
def main(): if len(sys.argv) != 3:
global driver print(f"USAGE: python {sys.argv[0]} <transfer file> <state file>")
if len(sys.argv) != 3 and len(sys.argv) != 4:
print(
f"USAGE: python {sys.argv[0]} <transfer file> <state file> [timeout minutes]"
)
exit(1) exit(1)
transfer_json_path = sys.argv[1] transfer_json_path = sys.argv[1]
state_json_path = sys.argv[2] state_json_path = sys.argv[2]
timeout_seconds = int(sys.argv[3] if len(sys.argv) == 4 else 120) * 60
# Set up timeout so that the GH action does not run forever, pretend it's ^C
print(f"Setting timeout to {timeout_seconds} seconds", file=sys.stderr)
signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
alarm(timeout_seconds)
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.add_argument("--headless")
user_agent = UserAgent().random user_agent = UserAgent().random
options.set_preference("general.useragent.override", user_agent)
print(f"Using randomized user agent {user_agent}", file=sys.stderr) print(f"Using randomized user agent {user_agent}", file=sys.stderr)
if sys.argv[-1] != "gui":
options.add_argument("--headless")
options.set_preference("general.useragent.override", user_agent)
driver = webdriver.Firefox(options=options) driver = webdriver.Firefox(options=options)
driver.get( driver.get(
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce" "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
@ -152,6 +136,12 @@ def main():
json.dump(state, sys.stderr, indent=4) json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr) print("", file=sys.stderr)
# Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C
signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
alarm(60 * 60 * 2)
try: try:
curr_page = 1 curr_page = 1
while state["inst_pg"] <= num_pages: while state["inst_pg"] <= num_pages:
@ -159,6 +149,7 @@ def main():
if state["inst_pg"] != 1: if state["inst_pg"] != 1:
while curr_page != state["inst_pg"]: while curr_page != state["inst_pg"]:
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
jumpable_pages = { jumpable_pages = {
int(x.get_attribute("href").split("'")[3][5:]): x int(x.get_attribute("href").split("'")[3][5:]): x
for x in driver.find_elements( for x in driver.find_elements(
@ -180,7 +171,6 @@ def main():
else: else:
jumpable_pages[max(jumpable_pages)].click() jumpable_pages[max(jumpable_pages)].click()
curr_page = max(jumpable_pages) curr_page = max(jumpable_pages)
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
wait(EC.staleness_of(page)) wait(EC.staleness_of(page))
sleep(random.uniform(3, 6)) sleep(random.uniform(3, 6))
@ -192,13 +182,9 @@ def main():
) )
) )
while state["inst_idx"] < inst_list_len: while state["inst_idx"] < inst_list_len:
institution_link = driver.find_element( institution_link = driver.find_element("id", "gdvInstWithEQ").find_elements(
"id", "gdvInstWithEQ"
).find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]" By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)[ )[state["inst_idx"]]
state["inst_idx"]
]
fields = institution_link.find_element(By.XPATH, "../..").find_elements( fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase" By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
) )
@ -243,7 +229,7 @@ def main():
wait( wait(
EC.element_to_be_clickable( EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".modal-header button") (By.CSS_SELECTOR, ".modal-header button")
), )
) )
transfer = [ transfer = [
@ -253,9 +239,7 @@ def main():
len( len(
driver.find_element( driver.find_element(
"id", "lblSendCourseEQDetail" "id", "lblSendCourseEQDetail"
).find_elements( ).find_elements(By.CSS_SELECTOR, ".course-detail")
By.CSS_SELECTOR, ".course-detail"
)
), ),
) )
] ]
@ -267,9 +251,7 @@ def main():
len( len(
driver.find_element( driver.find_element(
"id", "lblReceiveCourseEQDetail" "id", "lblReceiveCourseEQDetail"
).find_elements( ).find_elements(By.CSS_SELECTOR, ".course-detail")
By.CSS_SELECTOR, ".course-detail"
)
), ),
) )
] ]
@ -282,9 +264,7 @@ def main():
begin_date = driver.find_element( begin_date = driver.find_element(
"id", "lblBeginEffectiveDate" "id", "lblBeginEffectiveDate"
).text ).text
end_date = driver.find_element( end_date = driver.find_element("id", "lblEndEffectiveDate").text
"id", "lblEndEffectiveDate"
).text
driver.find_element( driver.find_element(
By.CSS_SELECTOR, ".modal-header button" By.CSS_SELECTOR, ".modal-header button"
@ -299,7 +279,7 @@ def main():
} }
] ]
state["course_idx"] += 1 state["course_idx"] += 1
except (Exception, KeyboardInterrupt) as e: except Exception as e:
institutions.update( institutions.update(
{ {
inst_name: { inst_name: {
@ -322,7 +302,7 @@ def main():
wait( wait(
EC.text_to_be_present_in_element( EC.text_to_be_present_in_element(
("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"]) ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
), )
) )
state["inst_idx"] = 0 state["inst_idx"] = 0
state["inst_pg"] = (state["inst_pg"] % num_pages) + 1 state["inst_pg"] = (state["inst_pg"] % num_pages) + 1
@ -339,7 +319,3 @@ def main():
with open(state_json_path, "w") as statejson: with open(state_json_path, "w") as statejson:
json.dump(state, statejson, indent=4) json.dump(state, statejson, indent=4)
driver.quit() driver.quit()
if __name__ == "__main__":
main()