Compare commits

...

21 Commits

Author  SHA1        Message  Date
powe97  d03be03aeb  Move debug print to be more accurate  2024-03-01 01:50:01 -05:00
powe97  019b777228  Make transfer scraper run continuously (at least as much as Github allows)  2024-03-01 01:45:06 -05:00
powe97  1a4542e20e  Fix crashing without timeout arg and re-add --headless  2024-03-01 00:29:34 -05:00
powe97  b0acd0e745  Dammit python  2024-02-29 22:31:09 -05:00
powe97  53891400ea  Every 15 minutes  2024-02-29 22:29:51 -05:00
powe97  c6e28d399a  Make timeout field have default value  2024-02-29 22:28:00 -05:00
powe97  682b1679b4  Run every 15 mins  2024-02-29 22:25:22 -05:00
powe97  aa4af079f8  Merge branch 'main' of https://github.com/quatalog/quatalog  2024-02-29 22:13:54 -05:00
powe97  cf2abf7193  Fix partial updates when KeyboardInterrupt happens mid-institution  2024-02-29 22:13:44 -05:00
powe97  55e34c9dd4  Bump versions of actions  2024-02-29 22:06:23 -05:00
powe97  efad1e9103  Bump versions for actions  2024-02-29 22:01:40 -05:00
powe97  cf953b2f02  Merge branch 'main' of https://github.com/quatalog/quatalog  2024-02-29 21:45:29 -05:00
powe97  44067261c3  Don't put whole repo in artifact  2024-02-29 21:45:22 -05:00
powe97  d268233d8b  Update transfer.yml  2024-02-29 21:38:17 -05:00
powe97  8a3e8a84d8  See previous commit  2024-02-29 21:25:53 -05:00
powe97  fd2da56aee  Make checkout data repo actually check the data repo out  2024-02-29 21:23:29 -05:00
powe97  12d844ca28  Fix global var fuckery  2024-02-29 21:21:39 -05:00
powe97  4916feeb19  Add debug timeout to workflow  2024-02-29 21:16:07 -05:00
powe97  b304e9f8d2  Fix scraper  2024-02-29 21:02:38 -05:00
powe97  f216c45748  Add if __name__ == "__main__" and fix workflow  2024-02-29 20:49:45 -05:00
powe97  15b09123ee  Set up workflow for transfer scraper  2024-02-29 20:40:15 -05:00
3 changed files with 290 additions and 237 deletions

View File

@@ -10,12 +10,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout scraping repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           path: quatalog-scraping
       - name: Clone QuACS data
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quacs/quacs-data
           path: quacs-data
@@ -36,7 +36,7 @@ jobs:
           rsync -avz "quacs-data/semester_data/$CURRENT_TERM/catalog.json" new-data/catalog.json
       - name: Upload data to artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: new-data
           path: new-data/
@@ -47,14 +47,14 @@ jobs:
     needs: [scrape-data]
     steps:
       - name: Clone Quatalog data
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quatalog/data
           path: quatalog-data
           token: ${{ secrets.PUSH_TOKEN }}
       - name: Download data from artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: new-data
           path: new-data
@@ -78,12 +78,12 @@ jobs:
     needs: [scrape-data]
     steps:
       - name: Checkout scraping repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           path: quatalog-scraping
       - name: Clone Quatalog static site
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quatalog/site
           ref: static-generated
@@ -91,7 +91,7 @@ jobs:
           token: ${{ secrets.PUSH_TOKEN }}
       - name: Download data from artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: new-data
           path: new-data
@@ -129,14 +129,14 @@ jobs:
     needs: [generate-site]
     steps:
       - name: Checkout site repo/static-generated branch
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quatalog/site
           ref: static-generated
           path: static-generated
       - name: Checkout data repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quatalog/data
           path: quatalog-data
@@ -167,13 +167,13 @@ jobs:
     needs: [generate-site]
     steps:
       - name: Checkout static-generated branch
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quatalog/site
           ref: static-generated
       - name: Setup Pages
-        uses: actions/configure-pages@v3
+        uses: actions/configure-pages@v4
       - name: Archive github-pages artifact
         run: |
@@ -186,7 +186,7 @@ jobs:
             -cf "$RUNNER_TEMP/artifact.tar" .
       - name: Upload github-pages artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: github-pages
           path: ${{ runner.temp }}/artifact.tar
@@ -207,4 +207,4 @@ jobs:
     steps:
       - name: Deploy to GitHub Pages
         id: deployment
-        uses: actions/deploy-pages@v1
+        uses: actions/deploy-pages@v4

View File

@@ -1,13 +1,22 @@
 name: Scrape transfer and update file
 run-name: Scrape transfer and update file
 on:
-  # schedule:
-  #   - cron: '*/15 * * * *'
+  repository_dispatch:
+    types: transfer-scraper
   workflow_dispatch:
+    # schedule:
+    #   - cron: '15 * * * *'
+    inputs:
+      timeout:
+        description: "Timeout time"
+        required: true
+        type: number
+        default: 120
+concurrency:
+  group: transfer-scraper
 jobs:
-  scrape-transfer:
+  scrape-data:
     name: Scrape transfer guide
     runs-on: ubuntu-latest
     steps:
@@ -16,34 +25,40 @@ jobs:
         with:
           path: quatalog-scraping
+      - name: Checkout data repo
+        uses: actions/checkout@v4
+        with:
+          repository: quatalog/data
+          path: data
       - name: Set up python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: '3.11'
           cache: 'pip'
       - name: Install dependencies
         working-directory: quatalog-scraping/transfer_scraper
-        run: pip install -r 'requirements.txt'
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r 'requirements.txt'
       - name: Log IP
-        run: echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
-      - name: Retrieve existing data
-        run:
-          mkdir data
-          cd data
-          wget 'https://raw.githubusercontent.com/powe97/rpi-transfer-scraper/main/transfer.json'
-          wget 'https://raw.githubusercontent.com/powe97/rpi-transfer-scraper/main/transfer_state.json'
+        run: |
+          echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
       - name: Scrape transfer guide
-        run: python3 quatalog-scraping/transfer_scraper data/transfer.json data/transfer_state.json
+        run: |
+          mkdir new-data
+          rsync -avzh data/transfer.json new-data
+          rsync -avzh data/transfer_state.json new-data
+          python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout }}
       - name: Upload data to artifact
         uses: actions/upload-artifact@v4
         with:
           name: transfer-data
-          path: data/
+          path: new-data/

   push-new-data:
     name: Push new data to data repo
@@ -51,21 +66,21 @@ jobs:
     needs: [scrape-data]
     steps:
       - name: Clone Quatalog data
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: quatalog/data
           path: quatalog-data
           token: ${{ secrets.PUSH_TOKEN }}
       - name: Download data from artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: transfer-data
-          path: data/
+          path: data
       - name: Copy data to repo directory
         run: |
-          rsync -avz data/ quatalog-data/
+          rsync -avzh data/ quatalog-data/
       - name: Push new data
         working-directory: quatalog-data
@@ -75,3 +90,17 @@ jobs:
         git add transfer.json transfer_state.json
         git commit -m "$(date)" || exit 0
         git push
+
+  re-run-scraper:
+    name: Tell Github to run this workflow again
+    runs-on: ubuntu-latest
+    needs: [push-new-data]
+    steps:
+      - name: Tell Github to run this workflow again
+        run: |
+          curl -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: token ${{ secrets.PUSH_TOKEN }}" \
+            --request POST \
+            --data '{"event_type": "transfer-scraper"}' \
+            "https://api.github.com/repos/quatalog/quatalog/dispatches"

View File

@@ -31,6 +31,8 @@ def normalize_class_name(input):
 def wait(ec):
+    global driver
+
     WebDriverWait(
         driver, 20, ignored_exceptions=[StaleElementReferenceException]
     ).until(ec)
@@ -38,6 +40,8 @@ def wait(ec):
 def scrape_course_card(html_id, i, note):
+    global driver
+
     trs = (
         driver.find_element("id", html_id)
         .find_elements(By.CSS_SELECTOR, ".course-detail")[i]
@ -100,222 +104,242 @@ def scrape_course_card(html_id, i, note):
}
if len(sys.argv) != 3:
print(f"USAGE: python {sys.argv[0]} <transfer file> <state file>")
exit(1)
def main():
global driver
transfer_json_path = sys.argv[1]
state_json_path = sys.argv[2]
options = webdriver.FirefoxOptions()
user_agent = UserAgent().random
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
if sys.argv[-1] != "gui":
options.add_argument("--headless")
options.set_preference("general.useragent.override", user_agent)
driver = webdriver.Firefox(options=options)
driver.get(
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
)
num_pages = int(
driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
)
print(f"{num_pages} pages detected", file=sys.stderr)
state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
institutions = {}
if os.path.isfile(state_json_path):
with open(state_json_path, "r") as statejson:
state = json.load(statejson)
if os.path.isfile(transfer_json_path):
with open(transfer_json_path, "r") as transferjson:
institutions = json.load(transferjson)
print("Loaded state: ", end="", file=sys.stderr)
json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr)
# Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C
signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
alarm(60 * 60 * 2)
try:
curr_page = 1
while state["inst_pg"] <= num_pages:
page = driver.find_element("id", f"gdvInstWithEQ")
if state["inst_pg"] != 1:
while curr_page != state["inst_pg"]:
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
jumpable_pages = {
int(x.get_attribute("href").split("'")[3][5:]): x
for x in driver.find_elements(
By.CSS_SELECTOR,
"""a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
)
}
curr_page = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
).text.split()[-3]
)
if state["inst_pg"] in jumpable_pages:
jumpable_pages[state["inst_pg"]].click()
curr_page = state["inst_pg"]
elif state["inst_pg"] < min(jumpable_pages):
jumpable_pages[min(jumpable_pages)].click()
curr_page = min(jumpable_pages)
else:
jumpable_pages[max(jumpable_pages)].click()
curr_page = max(jumpable_pages)
wait(EC.staleness_of(page))
sleep(random.uniform(3, 6))
page = driver.find_element("id", f"gdvInstWithEQ")
inst_list_len = len(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)
if len(sys.argv) != 3 and len(sys.argv) != 4:
print(
f"USAGE: python {sys.argv[0]} <transfer file> <state file> [timeout minutes]"
)
while state["inst_idx"] < inst_list_len:
institution_link = driver.find_element("id", "gdvInstWithEQ").find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)[state["inst_idx"]]
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
)
inst_name = institution_link.text.title().strip()
city = fields[0].text.title().strip()
us_state = fields[1].text.strip()
exit(1)
institution_link.click()
wait(EC.staleness_of(institution_link))
transfer_json_path = sys.argv[1]
state_json_path = sys.argv[2]
timeout_seconds = int(sys.argv[3] if len(sys.argv) == 4 else 120) * 60
try:
course_pages_len = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
).text.split()[-1]
)
except NoSuchElementException:
course_pages_len = 1
# Set up timeout so that the GH action does not run forever, pretend it's ^C
print(f"Setting timeout to {timeout_seconds} seconds", file=sys.stderr)
signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
alarm(timeout_seconds)
try:
courses = institutions[inst_name]["courses"]
except:
courses = []
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
while state["course_pg"] <= course_pages_len:
course_links_len = len(
driver.find_element("id", "gdvCourseEQ").find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)
)
while state["course_idx"] < course_links_len:
course_link = driver.find_element(
"id", "gdvCourseEQ"
).find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)[
state["course_idx"]
]
course_link.click()
user_agent = UserAgent().random
options.set_preference("general.useragent.override", user_agent)
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
try:
wait(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".modal-header button")
)
driver = webdriver.Firefox(options=options)
driver.get(
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
)
num_pages = int(
driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
)
print(f"{num_pages} pages detected", file=sys.stderr)
state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
institutions = {}
if os.path.isfile(state_json_path):
with open(state_json_path, "r") as statejson:
state = json.load(statejson)
if os.path.isfile(transfer_json_path):
with open(transfer_json_path, "r") as transferjson:
institutions = json.load(transferjson)
print("Loaded state: ", end="", file=sys.stderr)
json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr)
try:
curr_page = 1
while state["inst_pg"] <= num_pages:
page = driver.find_element("id", f"gdvInstWithEQ")
if state["inst_pg"] != 1:
while curr_page != state["inst_pg"]:
jumpable_pages = {
int(x.get_attribute("href").split("'")[3][5:]): x
for x in driver.find_elements(
By.CSS_SELECTOR,
"""a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
)
transfer = [
scrape_course_card("lblSendCourseEQDetail", i, False)
for i in range(
0,
len(
driver.find_element(
"id", "lblSendCourseEQDetail"
).find_elements(By.CSS_SELECTOR, ".course-detail")
),
)
]
rpi = [
scrape_course_card("lblReceiveCourseEQDetail", i, True)
for i in range(
0,
len(
driver.find_element(
"id", "lblReceiveCourseEQDetail"
).find_elements(By.CSS_SELECTOR, ".course-detail")
),
)
]
print(
f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
file=sys.stderr,
)
begin_date = driver.find_element(
"id", "lblBeginEffectiveDate"
).text
end_date = driver.find_element("id", "lblEndEffectiveDate").text
}
curr_page = int(
driver.find_element(
By.CSS_SELECTOR, ".modal-header button"
).click()
"id", "lblInstWithEQPaginationInfo"
).text.split()[-3]
)
if state["inst_pg"] in jumpable_pages:
jumpable_pages[state["inst_pg"]].click()
curr_page = state["inst_pg"]
elif state["inst_pg"] < min(jumpable_pages):
jumpable_pages[min(jumpable_pages)].click()
curr_page = min(jumpable_pages)
else:
jumpable_pages[max(jumpable_pages)].click()
curr_page = max(jumpable_pages)
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
courses += [
{
"transfer": transfer,
"rpi": rpi,
"begin": begin_date,
"end": end_date,
}
]
state["course_idx"] += 1
except Exception as e:
institutions.update(
{
inst_name: {
"city": city,
"state": us_state,
"courses": courses,
}
}
)
raise e
state["course_idx"] = 0
state["course_pg"] += 1
institutions.update(
{inst_name: {"city": city, "state": us_state, "courses": courses}}
)
state["course_pg"] = 1
state["inst_idx"] += 1
wait(EC.staleness_of(page))
sleep(random.uniform(3, 6))
page = driver.find_element("id", f"gdvInstWithEQ")
driver.find_element("id", "btnSwitchView").click()
wait(
EC.text_to_be_present_in_element(
("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
inst_list_len = len(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)
)
state["inst_idx"] = 0
state["inst_pg"] = (state["inst_pg"] % num_pages) + 1
while state["inst_idx"] < inst_list_len:
institution_link = driver.find_element(
"id", "gdvInstWithEQ"
).find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)[
state["inst_idx"]
]
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
)
inst_name = institution_link.text.title().strip()
city = fields[0].text.title().strip()
us_state = fields[1].text.strip()
except (Exception, KeyboardInterrupt) as e:
print("Program hits exception and will save and terminate", file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
institution_link.click()
wait(EC.staleness_of(institution_link))
print("Program will terminate with state: ", end="", file=sys.stderr)
json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr)
with open(transfer_json_path, "w") as transferjson:
json.dump(institutions, transferjson, indent=4)
with open(state_json_path, "w") as statejson:
json.dump(state, statejson, indent=4)
driver.quit()
try:
course_pages_len = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
).text.split()[-1]
)
except NoSuchElementException:
course_pages_len = 1
try:
courses = institutions[inst_name]["courses"]
except:
courses = []
while state["course_pg"] <= course_pages_len:
course_links_len = len(
driver.find_element("id", "gdvCourseEQ").find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)
)
while state["course_idx"] < course_links_len:
course_link = driver.find_element(
"id", "gdvCourseEQ"
).find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)[
state["course_idx"]
]
course_link.click()
try:
wait(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".modal-header button")
),
)
transfer = [
scrape_course_card("lblSendCourseEQDetail", i, False)
for i in range(
0,
len(
driver.find_element(
"id", "lblSendCourseEQDetail"
).find_elements(
By.CSS_SELECTOR, ".course-detail"
)
),
)
]
rpi = [
scrape_course_card("lblReceiveCourseEQDetail", i, True)
for i in range(
0,
len(
driver.find_element(
"id", "lblReceiveCourseEQDetail"
).find_elements(
By.CSS_SELECTOR, ".course-detail"
)
),
)
]
print(
f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
file=sys.stderr,
)
begin_date = driver.find_element(
"id", "lblBeginEffectiveDate"
).text
end_date = driver.find_element(
"id", "lblEndEffectiveDate"
).text
driver.find_element(
By.CSS_SELECTOR, ".modal-header button"
).click()
courses += [
{
"transfer": transfer,
"rpi": rpi,
"begin": begin_date,
"end": end_date,
}
]
state["course_idx"] += 1
except (Exception, KeyboardInterrupt) as e:
institutions.update(
{
inst_name: {
"city": city,
"state": us_state,
"courses": courses,
}
}
)
raise e
state["course_idx"] = 0
state["course_pg"] += 1
institutions.update(
{inst_name: {"city": city, "state": us_state, "courses": courses}}
)
state["course_pg"] = 1
state["inst_idx"] += 1
driver.find_element("id", "btnSwitchView").click()
wait(
EC.text_to_be_present_in_element(
("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
),
)
state["inst_idx"] = 0
state["inst_pg"] = (state["inst_pg"] % num_pages) + 1
except (Exception, KeyboardInterrupt) as e:
print("Program hits exception and will save and terminate", file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
print("Program will terminate with state: ", end="", file=sys.stderr)
json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr)
with open(transfer_json_path, "w") as transferjson:
json.dump(institutions, transferjson, indent=4)
with open(state_json_path, "w") as statejson:
json.dump(state, statejson, indent=4)
driver.quit()
if __name__ == "__main__":
main()