diff --git a/transfer_scraper/main.py b/transfer_scraper/main.py
new file mode 100644
index 0000000..bbcf81d
--- /dev/null
+++ b/transfer_scraper/main.py
@@ -0,0 +1,321 @@
+import json
+import html
+import sys
+import re
+import os.path
+import traceback
+from time import sleep
+import random
+from signal import alarm, SIGALRM, signal
+from fake_useragent import UserAgent
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import StaleElementReferenceException
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import NoSuchElementException
+
+
+def raise_(ex):
+    raise ex
+
+
+def normalize_class_name(input):
+    text = list(input)
+    for i in range(1, len(text)):
+        if (text[i - 1] == " ") or (text[i - 1] == text[i] == "I"):
+            continue
+        text[i] = text[i].lower()
+    return "".join(text)
+
+
+def wait(ec):
+    WebDriverWait(
+        driver, 20, ignored_exceptions=[StaleElementReferenceException]
+    ).until(ec)
+    sleep(random.uniform(400, 1900) / 1000)
+
+
+def scrape_course_card(html_id, i, note):
+    trs = (
+        driver.find_element("id", html_id)
+        .find_elements(By.CSS_SELECTOR, ".course-detail")[i]
+        .find_elements(By.TAG_NAME, "tr")
+    )
+    course_name_and_id = trs[0].text.split()
+
+    course_desc = ""
+    if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
+        course_desc = trs[1].text
+
+    course_department = (
+        next((x for x in trs if x.text.strip().startswith("Department:")))
+        .find_elements(By.TAG_NAME, "td")[1]
+        .text.title()
+    )
+    course_catalog = (
+        next((x for x in trs if x.text.strip().startswith("Source catalog:")))
+        .find_elements(By.TAG_NAME, "td")[1]
+        .text
+    )
+
+    try:
+        k = 1 + next(
+            i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
+        )
+        course_id = " ".join(course_name_and_id[0:k])
+        course_name = normalize_class_name(" ".join(course_name_and_id[k:]))
+    except StopIteration:  # Handling for Not Transferrable
+        course_id = course_name_and_id[0]
+        course_name = normalize_class_name(" ".join(course_name_and_id[1:]))
+
+    if not note:
+        try:
+            course_credits = (
+                next((x for x in trs if x.text.strip().startswith("Units:")))
+                .find_elements(By.TAG_NAME, "td")[1]
+                .text.strip()
+            )
+        except:
+            course_credits = ""
+
+        return {
+            "id": course_id,
+            "name": course_name,
+            "credits": course_credits,
+            "desc": course_desc,
+            "department": course_department,
+            "catalog": course_catalog,
+        }
+    else:
+        course_note = driver.find_element("id", "lblCommentsPublic").text.strip()
+        return {
+            "id": course_id,
+            "name": course_name,
+            "note": course_note,
+            "desc": course_desc,
+            "department": course_department,
+            "catalog": course_catalog,
+        }
+
+
+if len(sys.argv) != 3:
+    print(f"USAGE: python {sys.argv[0]} <transfer_json_path> <state_json_path>")
+    exit(1)
+
+transfer_json_path = sys.argv[1]
+state_json_path = sys.argv[2]
+
+options = webdriver.FirefoxOptions()
+user_agent = UserAgent().random
+print(f"Using randomized user agent {user_agent}", file=sys.stderr)
+if sys.argv[-1] != "gui":
+    options.add_argument("--headless")
+options.set_preference("general.useragent.override", user_agent)
+driver = webdriver.Firefox(options=options)
+driver.get(
+    "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
+)
+
+num_pages = int(
+    driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
+)
+print(f"{num_pages} pages detected", file=sys.stderr)
+
+state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
+institutions = {}
+if os.path.isfile(state_json_path):
+    with open(state_json_path, "r") as statejson:
+        state = json.load(statejson)
+if os.path.isfile(transfer_json_path):
+    with open(transfer_json_path, "r") as transferjson:
+        institutions = json.load(transferjson)
+
+print("Loaded state: ", end="", file=sys.stderr)
+json.dump(state, sys.stderr, indent=4)
+print("", file=sys.stderr)
+
+
+# Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C
+signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
+alarm(60 * 60 * 2)
+
+
+try:
+    curr_page = 1
+    while state["inst_pg"] <= num_pages:
+        page = driver.find_element("id", f"gdvInstWithEQ")
+
+        if state["inst_pg"] != 1:
+            while curr_page != state["inst_pg"]:
+                print(f"Jumping to institution page {curr_page}", file=sys.stderr)
+                jumpable_pages = {
+                    int(x.get_attribute("href").split("'")[3][5:]): x
+                    for x in driver.find_elements(
+                        By.CSS_SELECTOR,
+                        """a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
+                    )
+                }
+                curr_page = int(
+                    driver.find_element(
+                        "id", "lblInstWithEQPaginationInfo"
+                    ).text.split()[-3]
+                )
+                if state["inst_pg"] in jumpable_pages:
+                    jumpable_pages[state["inst_pg"]].click()
+                    curr_page = state["inst_pg"]
+                elif state["inst_pg"] < min(jumpable_pages):
+                    jumpable_pages[min(jumpable_pages)].click()
+                    curr_page = min(jumpable_pages)
+                else:
+                    jumpable_pages[max(jumpable_pages)].click()
+                    curr_page = max(jumpable_pages)
+
+                wait(EC.staleness_of(page))
+                sleep(random.uniform(3, 6))
+                page = driver.find_element("id", f"gdvInstWithEQ")
+
+        inst_list_len = len(
+            page.find_elements(
+                By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
+            )
+        )
+        while state["inst_idx"] < inst_list_len:
+            institution_link = driver.find_element("id", "gdvInstWithEQ").find_elements(
+                By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
+            )[state["inst_idx"]]
+            fields = institution_link.find_element(By.XPATH, "../..").find_elements(
+                By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
+            )
+            inst_name = institution_link.text.title().strip()
+            city = fields[0].text.title().strip()
+            us_state = fields[1].text.strip()
+
+            institution_link.click()
+            wait(EC.staleness_of(institution_link))
+
+            try:
+                course_pages_len = int(
+                    driver.find_element(
+                        "id", "lblInstWithEQPaginationInfo"
+                    ).text.split()[-1]
+                )
+            except NoSuchElementException:
+                course_pages_len = 1
+
+            try:
+                courses = institutions[inst_name]["courses"]
+            except:
+                courses = []
+
+            while state["course_pg"] <= course_pages_len:
+                course_links_len = len(
+                    driver.find_element("id", "gdvCourseEQ").find_elements(
+                        By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
+                    )
+                )
+                while state["course_idx"] < course_links_len:
+                    course_link = driver.find_element(
+                        "id", "gdvCourseEQ"
+                    ).find_elements(
+                        By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
+                    )[
+                        state["course_idx"]
+                    ]
+                    course_link.click()
+
+                    try:
+                        wait(
+                            EC.element_to_be_clickable(
+                                (By.CSS_SELECTOR, ".modal-header button")
+                            )
+                        )
+
+                        transfer = [
+                            scrape_course_card("lblSendCourseEQDetail", i, False)
+                            for i in range(
+                                0,
+                                len(
+                                    driver.find_element(
+                                        "id", "lblSendCourseEQDetail"
+                                    ).find_elements(By.CSS_SELECTOR, ".course-detail")
+                                ),
+                            )
+                        ]
+
+                        rpi = [
+                            scrape_course_card("lblReceiveCourseEQDetail", i, True)
+                            for i in range(
+                                0,
+                                len(
+                                    driver.find_element(
+                                        "id", "lblReceiveCourseEQDetail"
+                                    ).find_elements(By.CSS_SELECTOR, ".course-detail")
+                                ),
+                            )
+                        ]
+
+                        print(
+                            f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
+                            file=sys.stderr,
+                        )
+
+                        begin_date = driver.find_element(
+                            "id", "lblBeginEffectiveDate"
+                        ).text
+                        end_date = driver.find_element("id", "lblEndEffectiveDate").text
+
+                        driver.find_element(
+                            By.CSS_SELECTOR, ".modal-header button"
+                        ).click()
+
+                        courses += [
+                            {
+                                "transfer": transfer,
+                                "rpi": rpi,
+                                "begin": begin_date,
+                                "end": end_date,
+                            }
+                        ]
+                        state["course_idx"] += 1
+                    except Exception as e:
+                        institutions.update(
+                            {
+                                inst_name: {
+                                    "city": city,
+                                    "state": us_state,
+                                    "courses": courses,
+                                }
+                            }
+                        )
+                        raise e
+                state["course_idx"] = 0
+                state["course_pg"] += 1
+            institutions.update(
+                {inst_name: {"city": city, "state": us_state, "courses": courses}}
+            )
+            state["course_pg"] = 1
+            state["inst_idx"] += 1
+
+            driver.find_element("id", "btnSwitchView").click()
+            wait(
+                EC.text_to_be_present_in_element(
+                    ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
+                )
+            )
+        state["inst_idx"] = 0
+        state["inst_pg"] = (state["inst_pg"] % num_pages) + 1
+
+except (Exception, KeyboardInterrupt) as e:
+    print("Program hits exception and will save and terminate", file=sys.stderr)
+    print(traceback.format_exc(), file=sys.stderr)
+
+print("Program will terminate with state: ", end="", file=sys.stderr)
+json.dump(state, sys.stderr, indent=4)
+print("", file=sys.stderr)
+with open(transfer_json_path, "w") as transferjson:
+    json.dump(institutions, transferjson, indent=4)
+with open(state_json_path, "w") as statejson:
+    json.dump(state, statejson, indent=4)
+driver.quit()
diff --git a/transfer_scraper/requirements.txt b/transfer_scraper/requirements.txt
new file mode 100644
index 0000000..85ebf67
--- /dev/null
+++ b/transfer_scraper/requirements.txt
@@ -0,0 +1,15 @@
+attrs==23.2.0
+certifi==2024.2.2
+fake-useragent==1.4.0
+h11==0.14.0
+idna==3.6
+outcome==1.3.0.post0
+PySocks==1.7.1
+selenium==4.18.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+trio==0.24.0
+trio-websocket==0.11.1
+typing_extensions==4.10.0
+urllib3==2.2.1
+wsproto==1.2.0