Redesign scraper to not be unbearably slow

powe97 2024-03-05 18:33:54 -05:00
parent 976b553b14
commit 6ad6f85708
No known key found for this signature in database
GPG key ID: 7D1663B10978D1BA


@@ -3,22 +3,17 @@ import html
import sys
import re
import os.path
import traceback
from time import sleep
import random
from signal import alarm, SIGALRM, signal
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import StaleElementReferenceException
- from selenium.common.exceptions import TimeoutException
- from selenium.common.exceptions import NoSuchElementException
- def raise_(ex):
- raise ex
+ from selenium.common.exceptions import (
+ StaleElementReferenceException,
+ NoSuchElementException,
+ )
# Fix course titles accounting for Roman numerals up to X
@@ -29,84 +24,34 @@ def normalize_title(input):
return s.strip()
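# The diff elides normalize_title's body; a plausible sketch (an assumption,
# not the committed code): title-case the text, then restore the Roman
# numerals I-X that str.title() mangles, e.g. "CALCULUS II" -> "Calculus Ii"
# -> "Calculus II". Single letters I/V/X survive .title() unchanged.
def normalize_title_sketch(input):
    s = input.strip().title()
    return re.sub(r"\b(Ii|Iii|Iv|Vi|Vii|Viii|Ix)\b", lambda m: m.group(0).upper(), s).strip()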
# Waits until EC plus some random wait time
def wait(ec):
global driver
WebDriverWait(
- driver, 20, ignored_exceptions=[StaleElementReferenceException]
+ driver, 60, ignored_exceptions=[StaleElementReferenceException]
).until(ec)
sleep(random.uniform(400, 1900) / 1000)
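# i.e. block (up to the new 60 s timeout) until the condition holds, then nap
# a random 0.4-1.9 s so successive requests stay spaced out; used like
# wait(EC.staleness_of(page)) after each postback below.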
def scrape_course_card(html_id, i, note):
# jump_to_page: navigates to a paginated page on this insufferable website
#
# curr_page: the current page number
# to_page: the page number to jump to
# num_pages: the total number of pages
# postback_type: javascript:__doPostBack('<this field>','Page$3')
# pagination_type: <span id="<this field>">PAGE 1 OF 27<br></span>
def jump_to_page(curr_page, to_page, postback_type, pagination_type):
global driver
trs = (
driver.find_element("id", html_id)
.find_elements(By.CSS_SELECTOR, ".course-detail")[i]
.find_elements(By.TAG_NAME, "tr")
)
course_name_and_id = trs[0].text.split()
course_desc = ""
if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
course_desc = trs[1].text
course_department = normalize_title(
next((x for x in trs if x.text.strip().startswith("Department:")))
.find_elements(By.TAG_NAME, "td")[1]
.text
)
course_catalog = normalize_title(
next((x for x in trs if x.text.strip().startswith("Source catalog:")))
.find_elements(By.TAG_NAME, "td")[1]
.text
)
page = driver.find_element(By.ID, postback_type)
try:
k = 1 + next(
i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
)
course_id = " ".join(course_name_and_id[0:k])
course_name = normalize_title(" ".join(course_name_and_id[k:]))
except StopIteration: # Handling for Not Transferrable
course_id = course_name_and_id[0]
course_name = normalize_title(" ".join(course_name_and_id[1:]))
if not note:
try:
course_credits = (
next((x for x in trs if x.text.strip().startswith("Units:")))
.find_elements(By.TAG_NAME, "td")[1]
.text.strip()
)
except (StopIteration, IndexError):  # no "Units:" row in the card
course_credits = ""
return {
"id": course_id,
"name": course_name,
"credits": course_credits,
"desc": course_desc,
"department": course_department,
"catalog": course_catalog,
}
else:
course_note = driver.find_element("id", "lblCommentsPublic").text.strip()
return {
"id": course_id,
"name": course_name,
"note": course_note,
"desc": course_desc,
"department": course_department,
"catalog": course_catalog,
}
- def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
- page = driver.find_element("id", postback_type)
- if num_pages == 1:
+ try:
+ num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
+ except NoSuchElementException:
return 1, page
if to_page > num_pages or to_page < 1:
raise ValueError(f"to_page was out of range ({to_page} not in [1, {num_pages}])")
while curr_page != to_page:
jumpable_pages = {
int(x.get_attribute("href").split("'")[3][5:]): x
@@ -117,7 +62,7 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
+ """','Page$"]""",
)
}
curr_page = int(driver.find_element("id", pagination_type).text.split()[-3])
curr_page = int(driver.find_element(By.ID, pagination_type).text.split()[-3])
if to_page in jumpable_pages:
jumpable_pages[to_page].click()
curr_page = to_page
@@ -131,247 +76,166 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
wait(EC.staleness_of(page))
sleep(random.uniform(400, 1900) / 1000)
page = driver.find_element("id", postback_type)
page = driver.find_element(By.ID, postback_type)
return curr_page, page
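# Usage sketch (hypothetical target page), matching the new signature above:
# curr_page, page = jump_to_page(
#     1, 5, "gdvInstWithEQ", "lblInstWithEQPaginationInfo"
# )
# Each postback replaces the grid node in the DOM, so callers must adopt the
# fresh element handle that is returned.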
# scrape_page: Scrapes a page of institutions
#
# page_num: The page to scrape.
# Note that the current page before running this function must be 1.
def scrape_page(page_num):
jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
num_institutions = len(
driver.find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)
)
print(f"Scraping page {page_num}, found {num_institutions} links")
return [scrape_institution(i) for i in range(0, num_institutions)]
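# e.g. institutions = scrape_page(3) (hypothetical page number) pages the grid
# from page 1 to page 3 and returns one record per institution link found
# there; the grid must already be on page 1, per the note above.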
# scrape_institution: Scrapes an institution by index.
#
# index: the 0-indexed index of the institution to scrape on the page we are on.
def scrape_institution(index):
# Go to institution page
inst_link = driver.find_element(
By.ID, f"gdvInstWithEQ_btnCreditFromInstName_{index}"
)
[inst_name, inst_city, inst_state, _] = [
e.text
for e in inst_link.find_element(By.XPATH, "../..").find_elements(
By.TAG_NAME, "td"
)
]
inst_name, inst_city = normalize_title(inst_name), normalize_title(inst_city)
inst_link.click()
wait(EC.staleness_of(inst_link))
print(f"Scraping {inst_name} ({inst_city}, {inst_state})")
# Add all courses
try:
num_pages = int(
driver.find_element(By.ID, "lblCourseEQPaginationInfo").text.split()[-1]
)
except NoSuchElementException:
num_pages = 1
for i in range(1, num_pages + 1):
jump_to_page(max(1, i - 1), i, "gdvCourseEQ", "lblCourseEQPaginationInfo")
driver.find_element(By.ID, "gdvCourseEQ_cbxHeaderCheckAll").click()
# Open list
driver.find_element(By.ID, "btnAddToMyEQList").click()
wait(EC.visibility_of_element_located((By.ID, "gdvMyCourseEQList")))
# Scrape list
tds = driver.find_element(By.ID, "gdvMyCourseEQList").find_elements(
By.TAG_NAME, "td"
)
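# zip(*[iter(...)]*6) below reuses one iterator six times, so the flat list of
# <td> cells is consumed in groups of six: transfer course, RPI course, note,
# begin date, end date, and a trailing cell that is discarded.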
transfer_courses = [
{
"transfer": parse_course_td(transfer_course),
"rpi": parse_course_td(rpi_course, note.text.strip()),
"begin": begin.text.strip(),
"end": end.text.strip(),
}
for transfer_course, rpi_course, note, begin, end, _ in zip(
*[iter(x for x in tds)] * 6
)
]
# Clear list
tr = driver.find_element(By.ID, "gdvMyCourseEQList").find_element(By.TAG_NAME, "tr")
driver.find_element(By.ID, "btnClearMyList").click()
wait(EC.staleness_of(tr))
# Exit My List menu
driver.find_element(By.CSS_SELECTOR, "#udpAddCourseEQToMyList button.close").click()
# Leave institution page
switch_view = driver.find_element(By.ID, "btnSwitchView")
switch_view.click()
wait(EC.staleness_of(switch_view))
return {
"institution": inst_name,
"city": inst_city,
"state": inst_state,
"courses": transfer_courses,
}
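# Shape of the record returned above (illustrative values, not scraped data):
# {
#     "institution": "Example State University",
#     "city": "Exampleville",
#     "state": "NY",
#     "courses": [{"transfer": {...}, "rpi": {...}, "begin": "...", "end": "..."}],
# }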
def parse_course_td(td, note=None):
course_info = (
html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
)
# Not all schools use the same course code format, so this figures out how long
# the code is, if there is one; there is none for Not Transferrable entries.
try:
course_id_delim = 1 + list(
bool(re.search(r"\d", s)) for s in course_info
).index(True)
except ValueError:
course_id_delim = 1
# Same deal with credit counts.
try:
cr_delim = (
len(course_info)
- 1
- list(bool(re.search(r"\(", s)) for s in course_info[::-1]).index(True)
)
except ValueError:
cr_delim = len(course_info)
# note serves as a credit count override, since the RPI-side credit counts
# are inaccurate
out = {
"id": " ".join(course_info[:course_id_delim]),
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
"catalog": td.find_element(By.TAG_NAME, "span").text,
}
if note is None:
out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1])}),
return out
else:
out.update({"note": note})
return out
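# Worked example of the two delimiter searches above, on a hypothetical cell
# "CSCI 1100 Computer Science I (4.000)": the first token containing a digit
# ends the course ID, and the last token containing "(" starts the credits.
_info = "CSCI 1100 Computer Science I (4.000)".split()
_id_delim = 1 + [bool(re.search(r"\d", s)) for s in _info].index(True)  # -> 2
_cr_delim = len(_info) - 1 - [("(" in s) for s in _info[::-1]].index(True)  # -> 5
assert " ".join(_info[:_id_delim]) == "CSCI 1100"
assert " ".join(_info[_id_delim:_cr_delim]) == "Computer Science I"
assert _info[_cr_delim][1:-1] == "4.000"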
def main():
global driver
- if len(sys.argv) != 3 and len(sys.argv) != 4:
- print(
- f"USAGE: python {sys.argv[0]} <transfer file> <state file> [timeout minutes]"
- )
- exit(1)
+ if len(sys.argv) != 3:
+ print(f"USAGE: python {sys.argv[0]} <page number to scrape> <output file>")
+ return 1
- transfer_json_path = sys.argv[1]
- state_json_path = sys.argv[2]
- timeout_seconds = int(sys.argv[3] if len(sys.argv) == 4 else 120) * 60
- # Set up timeout so that the GH action does not run forever, pretend it's ^C
- print(f"Setting timeout to {timeout_seconds} seconds", file=sys.stderr)
- signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
- alarm(timeout_seconds)
+ PAGE_NUM_TO_SCRAPE = int(sys.argv[1])
+ OUT_FILENAME = sys.argv[2]
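# On the removed path above, the alarm's fake ^C unwinds out of the scrape
# loops into the `except (Exception, KeyboardInterrupt)` handler at the bottom
# of main, which saves the transfer and state JSON so a rerun can resume.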
print(f"Setting up selenium Firefox emulator")
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
user_agent = UserAgent().random
options.set_preference("general.useragent.override", user_agent)
# options.set_preference("network.proxy.socks", "")
# options.set_preference("network.proxy.socks_port", )
# options.set_preference("network.proxy.socks_remote_dns", True)
# options.set_preference("network.proxy.type", 1)
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
driver = webdriver.Firefox(options=options)
print(f"Connecting to the TES Public View")
driver.get(
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
)
print(
f'Title is {driver.find_element(By.TAG_NAME, "title").get_attribute("innerText").strip()}',
file=sys.stderr,
)
with open("transfer.json", "w") as transferjson:
json.dump(scrape_page(PAGE_NUM_TO_SCRAPE), transferjson, indent=4)
num_pages = int(
driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
)
print(f"{num_pages} pages detected", file=sys.stderr)
state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
institutions = {}
if os.path.isfile(state_json_path):
with open(state_json_path, "r") as statejson:
state = json.load(statejson)
if os.path.isfile(transfer_json_path):
with open(transfer_json_path, "r") as transferjson:
institutions = json.load(transferjson)
print("Loaded state: ", end="", file=sys.stderr)
json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr)
if state["inst_pg"] > num_pages:
raise Exception(f"Saved state inst_pg {state['inst_pg']} exceeds detected page count {num_pages}")
try:
curr_inst_page = 1
while state["inst_pg"] <= num_pages:
curr_inst_page, page = jump_to_page(
curr_inst_page,
state["inst_pg"],
num_pages,
"gdvInstWithEQ",
"lblInstWithEQPaginationInfo",
)
inst_list_len = len(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)
)
while state["inst_idx"] < inst_list_len:
institution_link = driver.find_element(
"id", "gdvInstWithEQ"
).find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)[
state["inst_idx"]
]
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
)
inst_name = normalize_title(institution_link.text)
city = normalize_title(fields[0].text)
us_state = fields[1].text.strip()
institution_link.click()
wait(EC.staleness_of(institution_link))
try:
course_pages_len = int(
driver.find_element(
"id", "lblCourseEQPaginationInfo"
).text.split()[-1]
)
except NoSuchElementException:
course_pages_len = 1
try:
courses = institutions[inst_name]["courses"]
except Exception:
courses = []
curr_course_page = 1
while state["course_pg"] <= course_pages_len:
curr_course_page, page = jump_to_page(
curr_course_page,
state["course_pg"],
course_pages_len,
"gdvCourseEQ",
"lblCourseEQPaginationInfo",
)
course_links_len = len(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)
)
while state["course_idx"] < course_links_len:
course_link = driver.find_element(
"id", "gdvCourseEQ"
).find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)[
state["course_idx"]
]
course_link.click()
try:
wait(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".modal-header button")
),
)
transfer = [
scrape_course_card("lblSendCourseEQDetail", i, False)
for i in range(
0,
len(
driver.find_element(
"id", "lblSendCourseEQDetail"
).find_elements(
By.CSS_SELECTOR, ".course-detail"
)
),
)
]
rpi = [
scrape_course_card("lblReceiveCourseEQDetail", i, True)
for i in range(
0,
len(
driver.find_element(
"id", "lblReceiveCourseEQDetail"
).find_elements(
By.CSS_SELECTOR, ".course-detail"
)
),
)
]
print(
f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
file=sys.stderr,
)
begin_date = driver.find_element(
"id", "lblBeginEffectiveDate"
).text
end_date = driver.find_element(
"id", "lblEndEffectiveDate"
).text
driver.find_element(
By.CSS_SELECTOR, ".modal-header button"
).click()
courses += [
{
"transfer": transfer,
"rpi": rpi,
"begin": begin_date,
"end": end_date,
}
]
state["course_idx"] += 1
except (Exception, KeyboardInterrupt) as e:
institutions.update(
{
inst_name: {
"city": city,
"state": us_state,
"courses": courses,
}
}
)
raise e
state["course_idx"] = 0
state["course_pg"] += 1
institutions.update(
{inst_name: {"city": city, "state": us_state, "courses": courses}}
)
state["course_pg"] = 1
state["inst_idx"] += 1
driver.find_element("id", "btnSwitchView").click()
wait(
EC.text_to_be_present_in_element(
("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
),
)
state["inst_idx"] = 0
state["inst_pg"] += 1
except (Exception, KeyboardInterrupt) as e:
print("Program hits exception and will save and terminate", file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
print("Program will terminate with state: ", end="", file=sys.stderr)
json.dump(state, sys.stderr, indent=4)
print("", file=sys.stderr)
with open(transfer_json_path, "w") as transferjson:
json.dump(institutions, transferjson, indent=4)
with open(state_json_path, "w") as statejson:
json.dump(state, statejson, indent=4)
driver.quit()
if __name__ == "__main__":
- main()
+ exit(main())