Compare commits

...

8 Commits

2 changed files with 76 additions and 50 deletions

View File

@ -1,5 +1,7 @@
name: Scrape transfer and update file
run-name: Scrape transfer and update file
env:
DEFAULT_TIMEOUT: 45
on:
# schedule:
# - cron: '*/15 * * * *'
@ -11,7 +13,7 @@ on:
description: "Timeout time"
required: true
type: number
default: 120
default: 2
concurrency:
group: transfer-scraper
@ -52,7 +54,7 @@ jobs:
mkdir new-data
rsync -avzh data/transfer.json new-data
rsync -avzh data/transfer_state.json new-data
python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout }}
python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout || env.DEFAULT_TIMEOUT }}
- name: Upload data to artifact
uses: actions/upload-artifact@v4

View File

@ -21,13 +21,12 @@ def raise_(ex):
raise ex
def normalize_class_name(input):
    """Lowercase a class name while preserving word-initial capitals.

    A character keeps its case when it follows a space (start of a word)
    or when it and its predecessor are both "I" (a Roman-numeral run).
    Works sequentially on a mutable character list, so each decision sees
    the already-modified previous character.
    """
    chars = list(input)
    idx = 1
    while idx < len(chars):
        prev = chars[idx - 1]
        keep_upper = prev == " " or (prev == "I" and chars[idx] == "I")
        if not keep_upper:
            chars[idx] = chars[idx].lower()
        idx += 1
    return "".join(chars)
# Fix course titles accounting for Roman numerals up to X
def normalize_title(input):
    """Title-case a course title.

    Collapses runs of whitespace, capitalizes each word (apostrophe-joined
    words such as "bachelor's" are treated as one word), then restores
    Roman numerals II through IX to full uppercase. Single-letter numerals
    (I, V, X) are already correct after capitalization.
    """
    collapsed = " ".join(input.split())
    word_pat = re.compile(r"[A-Za-z]+(['][A-Za-z]+)?")
    titled = word_pat.sub(lambda m: m.group(0).capitalize(), collapsed)
    numeral_pat = re.compile(r"\b(Viii|Vii|Vi|Iv|Ix|Iii|Ii)\b")
    return numeral_pat.sub(lambda m: m.group(0).upper(), titled).strip()
def wait(ec):
@ -53,12 +52,12 @@ def scrape_course_card(html_id, i, note):
if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
course_desc = trs[1].text
course_department = (
course_department = normalize_title(
next((x for x in trs if x.text.strip().startswith("Department:")))
.find_elements(By.TAG_NAME, "td")[1]
.text.title()
.text
)
course_catalog = (
course_catalog = normalize_title(
next((x for x in trs if x.text.strip().startswith("Source catalog:")))
.find_elements(By.TAG_NAME, "td")[1]
.text
@ -69,10 +68,10 @@ def scrape_course_card(html_id, i, note):
i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
)
course_id = " ".join(course_name_and_id[0:k])
course_name = normalize_class_name(" ".join(course_name_and_id[k:]))
course_name = normalize_title(" ".join(course_name_and_id[k:]))
except StopIteration: # Handling for Not Transferrable
course_id = course_name_and_id[0]
course_name = normalize_class_name(" ".join(course_name_and_id[1:]))
course_name = normalize_title(" ".join(course_name_and_id[1:]))
if not note:
try:
@ -104,6 +103,38 @@ def scrape_course_card(html_id, i, note):
}
def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
    """Navigate a paginated grid to ``to_page`` by clicking __doPostBack links.

    Args:
        curr_page: page the caller believes is current (re-read from the
            pagination label each iteration, so a stale value is tolerated).
        to_page: 1-based page number to reach.
        num_pages: total number of pages in the grid.
        postback_type: element id of the grid (also the __doPostBack target).
        pagination_type: element id of the "Page X of Y"-style label.

    Returns:
        (reached_page, element) — the page actually reached and a fresh
        reference to the grid element.

    NOTE(review): relies on module globals `driver`, `wait`, `sleep`,
    `random`, `sys`, `By`, `EC` — assumed to be a live Selenium session;
    cannot be verified from this excerpt alone.
    """
    page = driver.find_element("id", postback_type)
    # A single-page grid renders no pagination links; nothing to do.
    if num_pages == 1:
        return 1, page
    while curr_page != to_page:
        # Map each visible page number to its anchor. The href looks like
        # javascript:__doPostBack('<postback_type>','Page$N'); splitting on
        # quotes puts "Page$N" at index 3, and [5:] strips the "Page$" prefix.
        jumpable_pages = {
            int(x.get_attribute("href").split("'")[3][5:]): x
            for x in driver.find_elements(
                By.CSS_SELECTOR,
                """a[href^="javascript:__doPostBack('"""
                + postback_type
                + """','Page$"]""",
            )
        }
        # Re-read the authoritative current page from the pagination label
        # (third-from-last whitespace-separated token — TODO confirm format).
        curr_page = int(driver.find_element("id", pagination_type).text.split()[-3])
        if to_page in jumpable_pages:
            # Target is directly visible — one click finishes the jump.
            jumpable_pages[to_page].click()
            curr_page = to_page
        elif to_page < min(jumpable_pages):
            # Target lies left of the visible window: step to the smallest
            # visible page and loop again.
            jumpable_pages[min(jumpable_pages)].click()
            curr_page = min(jumpable_pages)
        else:
            # Target lies right of the visible window: step to the largest
            # visible page and loop again.
            jumpable_pages[max(jumpable_pages)].click()
            curr_page = max(jumpable_pages)
        print(f"Jumping to {postback_type} page {curr_page}", file=sys.stderr)
        # Wait for the old grid element to go stale (the postback replaced
        # it), then pause a random interval — presumably rate-limiting to
        # avoid tripping the server's bot detection.
        wait(EC.staleness_of(page))
        sleep(random.uniform(3, 6))
        page = driver.find_element("id", postback_type)
    return curr_page, page
def main():
global driver
@ -127,6 +158,10 @@ def main():
user_agent = UserAgent().random
options.set_preference("general.useragent.override", user_agent)
# options.set_preference("network.proxy.socks", "")
# options.set_preference("network.proxy.socks_port", )
# options.set_preference("network.proxy.socks_remote_dns", True)
# options.set_preference("network.proxy.type", 1)
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
driver = webdriver.Firefox(options=options)
@ -153,44 +188,22 @@ def main():
print("", file=sys.stderr)
try:
curr_page = 1
curr_inst_page = 1
while state["inst_pg"] <= num_pages:
page = driver.find_element("id", f"gdvInstWithEQ")
if state["inst_pg"] != 1:
while curr_page != state["inst_pg"]:
jumpable_pages = {
int(x.get_attribute("href").split("'")[3][5:]): x
for x in driver.find_elements(
By.CSS_SELECTOR,
"""a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
)
}
curr_page = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
).text.split()[-3]
)
if state["inst_pg"] in jumpable_pages:
jumpable_pages[state["inst_pg"]].click()
curr_page = state["inst_pg"]
elif state["inst_pg"] < min(jumpable_pages):
jumpable_pages[min(jumpable_pages)].click()
curr_page = min(jumpable_pages)
else:
jumpable_pages[max(jumpable_pages)].click()
curr_page = max(jumpable_pages)
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
wait(EC.staleness_of(page))
sleep(random.uniform(3, 6))
page = driver.find_element("id", f"gdvInstWithEQ")
curr_inst_page, page = jump_to_page(
curr_inst_page,
state["inst_pg"],
num_pages,
"gdvInstWithEQ",
"lblInstWithEQPaginationInfo",
)
inst_list_len = len(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)
)
while state["inst_idx"] < inst_list_len:
institution_link = driver.find_element(
"id", "gdvInstWithEQ"
@ -202,8 +215,8 @@ def main():
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
)
inst_name = institution_link.text.title().strip()
city = fields[0].text.title().strip()
inst_name = normalize_title(institution_link.text)
city = normalize_title(fields[0].text)
us_state = fields[1].text.strip()
institution_link.click()
@ -212,7 +225,7 @@ def main():
try:
course_pages_len = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
"id", "lblCourseEQPaginationInfo"
).text.split()[-1]
)
except NoSuchElementException:
@ -220,15 +233,25 @@ def main():
try:
courses = institutions[inst_name]["courses"]
except:
except Exception:
courses = []
curr_course_page = 1
while state["course_pg"] <= course_pages_len:
curr_course_page, page = jump_to_page(
curr_course_page,
state["course_pg"],
course_pages_len,
"gdvCourseEQ",
"lblCourseEQPaginationInfo",
)
course_links_len = len(
driver.find_element("id", "gdvCourseEQ").find_elements(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)
)
while state["course_idx"] < course_links_len:
course_link = driver.find_element(
"id", "gdvCourseEQ"
@ -312,6 +335,7 @@ def main():
raise e
state["course_idx"] = 0
state["course_pg"] += 1
institutions.update(
{inst_name: {"city": city, "state": us_state, "courses": courses}}
)