Compare commits

...

8 Commits

2 changed files with 76 additions and 50 deletions

View File

@ -1,5 +1,7 @@
name: Scrape transfer and update file
run-name: Scrape transfer and update file
env:
DEFAULT_TIMEOUT: 45
on:
# schedule:
# - cron: '*/15 * * * *'
@ -11,7 +13,7 @@ on:
description: "Timeout time"
required: true
type: number
default: 120
default: 2
concurrency:
group: transfer-scraper
@ -52,7 +54,7 @@ jobs:
mkdir new-data
rsync -avzh data/transfer.json new-data
rsync -avzh data/transfer_state.json new-data
python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout }}
python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout || env.DEFAULT_TIMEOUT }}
- name: Upload data to artifact
uses: actions/upload-artifact@v4

View File

@ -21,13 +21,12 @@ def raise_(ex):
raise ex
def normalize_class_name(input):
    """Lowercase a class name while preserving word-initial capitals.

    A character keeps its case when it follows a space (start of a word)
    or when it and its predecessor are both "I" (a Roman-numeral run).
    Works sequentially on a mutable character list, so each decision sees
    the already-modified previous character.
    """
    chars = list(input)
    idx = 1
    while idx < len(chars):
        prev = chars[idx - 1]
        keep_upper = prev == " " or (prev == "I" and chars[idx] == "I")
        if not keep_upper:
            chars[idx] = chars[idx].lower()
        idx += 1
    return "".join(chars)
# Fix course titles accounting for Roman numerals up to X
def normalize_title(input):
    """Title-case a course title.

    Collapses runs of whitespace, capitalizes each word (apostrophe-joined
    words such as "bachelor's" are treated as one word), then restores
    Roman numerals II through IX to full uppercase. Single-letter numerals
    (I, V, X) are already correct after capitalization.
    """
    collapsed = " ".join(input.split())
    word_pat = re.compile(r"[A-Za-z]+(['][A-Za-z]+)?")
    titled = word_pat.sub(lambda m: m.group(0).capitalize(), collapsed)
    numeral_pat = re.compile(r"\b(Viii|Vii|Vi|Iv|Ix|Iii|Ii)\b")
    return numeral_pat.sub(lambda m: m.group(0).upper(), titled).strip()
def wait(ec):
@ -53,12 +52,12 @@ def scrape_course_card(html_id, i, note):
if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
course_desc = trs[1].text
course_department = (
course_department = normalize_title(
next((x for x in trs if x.text.strip().startswith("Department:")))
.find_elements(By.TAG_NAME, "td")[1]
.text.title()
.text
)
course_catalog = (
course_catalog = normalize_title(
next((x for x in trs if x.text.strip().startswith("Source catalog:")))
.find_elements(By.TAG_NAME, "td")[1]
.text
@ -69,10 +68,10 @@ def scrape_course_card(html_id, i, note):
i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
)
course_id = " ".join(course_name_and_id[0:k])
course_name = normalize_class_name(" ".join(course_name_and_id[k:]))
course_name = normalize_title(" ".join(course_name_and_id[k:]))
except StopIteration: # Handling for Not Transferrable
course_id = course_name_and_id[0]
course_name = normalize_class_name(" ".join(course_name_and_id[1:]))
course_name = normalize_title(" ".join(course_name_and_id[1:]))
if not note:
try:
@ -104,6 +103,38 @@ def scrape_course_card(html_id, i, note):
}
def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
    """Navigate a paginated grid to ``to_page`` by clicking __doPostBack links.

    Args:
        curr_page: page the caller believes is current (re-read from the
            pagination label each iteration, so a stale value is tolerated).
        to_page: 1-based page number to reach.
        num_pages: total number of pages in the grid.
        postback_type: element id of the grid (also the __doPostBack target).
        pagination_type: element id of the "Page X of Y"-style label.

    Returns:
        (reached_page, element) — the page actually reached and a fresh
        reference to the grid element.

    NOTE(review): relies on module globals `driver`, `wait`, `sleep`,
    `random`, `sys`, `By`, `EC` — assumed to be a live Selenium session;
    cannot be verified from this excerpt alone.
    """
    page = driver.find_element("id", postback_type)
    # A single-page grid renders no pagination links; nothing to do.
    if num_pages == 1:
        return 1, page
    while curr_page != to_page:
        # Map each visible page number to its anchor. The href looks like
        # javascript:__doPostBack('<postback_type>','Page$N'); splitting on
        # quotes puts "Page$N" at index 3, and [5:] strips the "Page$" prefix.
        jumpable_pages = {
            int(x.get_attribute("href").split("'")[3][5:]): x
            for x in driver.find_elements(
                By.CSS_SELECTOR,
                """a[href^="javascript:__doPostBack('"""
                + postback_type
                + """','Page$"]""",
            )
        }
        # Re-read the authoritative current page from the pagination label
        # (third-from-last whitespace-separated token — TODO confirm format).
        curr_page = int(driver.find_element("id", pagination_type).text.split()[-3])
        if to_page in jumpable_pages:
            # Target is directly visible — one click finishes the jump.
            jumpable_pages[to_page].click()
            curr_page = to_page
        elif to_page < min(jumpable_pages):
            # Target lies left of the visible window: step to the smallest
            # visible page and loop again.
            jumpable_pages[min(jumpable_pages)].click()
            curr_page = min(jumpable_pages)
        else:
            # Target lies right of the visible window: step to the largest
            # visible page and loop again.
            jumpable_pages[max(jumpable_pages)].click()
            curr_page = max(jumpable_pages)
        print(f"Jumping to {postback_type} page {curr_page}", file=sys.stderr)
        # Wait for the old grid element to go stale (the postback replaced
        # it), then pause a random interval — presumably rate-limiting to
        # avoid tripping the server's bot detection.
        wait(EC.staleness_of(page))
        sleep(random.uniform(3, 6))
        page = driver.find_element("id", postback_type)
    return curr_page, page
def main():
global driver
@ -127,6 +158,10 @@ def main():
user_agent = UserAgent().random
options.set_preference("general.useragent.override", user_agent)
# options.set_preference("network.proxy.socks", "")
# options.set_preference("network.proxy.socks_port", )
# options.set_preference("network.proxy.socks_remote_dns", True)
# options.set_preference("network.proxy.type", 1)
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
driver = webdriver.Firefox(options=options)
@ -153,44 +188,22 @@ def main():
print("", file=sys.stderr)
try:
curr_page = 1
curr_inst_page = 1
while state["inst_pg"] <= num_pages:
page = driver.find_element("id", f"gdvInstWithEQ")
if state["inst_pg"] != 1:
while curr_page != state["inst_pg"]:
jumpable_pages = {
int(x.get_attribute("href").split("'")[3][5:]): x
for x in driver.find_elements(
By.CSS_SELECTOR,
"""a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
)
}
curr_page = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
).text.split()[-3]
)
if state["inst_pg"] in jumpable_pages:
jumpable_pages[state["inst_pg"]].click()
curr_page = state["inst_pg"]
elif state["inst_pg"] < min(jumpable_pages):
jumpable_pages[min(jumpable_pages)].click()
curr_page = min(jumpable_pages)
else:
jumpable_pages[max(jumpable_pages)].click()
curr_page = max(jumpable_pages)
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
wait(EC.staleness_of(page))
sleep(random.uniform(3, 6))
page = driver.find_element("id", f"gdvInstWithEQ")
curr_inst_page, page = jump_to_page(
curr_inst_page,
state["inst_pg"],
num_pages,
"gdvInstWithEQ",
"lblInstWithEQPaginationInfo",
)
inst_list_len = len(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
)
)
while state["inst_idx"] < inst_list_len:
institution_link = driver.find_element(
"id", "gdvInstWithEQ"
@ -202,8 +215,8 @@ def main():
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
)
inst_name = institution_link.text.title().strip()
city = fields[0].text.title().strip()
inst_name = normalize_title(institution_link.text)
city = normalize_title(fields[0].text)
us_state = fields[1].text.strip()
institution_link.click()
@ -212,7 +225,7 @@ def main():
try:
course_pages_len = int(
driver.find_element(
"id", "lblInstWithEQPaginationInfo"
"id", "lblCourseEQPaginationInfo"
).text.split()[-1]
)
except NoSuchElementException:
@ -220,15 +233,25 @@ def main():
try:
courses = institutions[inst_name]["courses"]
except:
except Exception:
courses = []
curr_course_page = 1
while state["course_pg"] <= course_pages_len:
curr_course_page, page = jump_to_page(
curr_course_page,
state["course_pg"],
course_pages_len,
"gdvCourseEQ",
"lblCourseEQPaginationInfo",
)
course_links_len = len(
driver.find_element("id", "gdvCourseEQ").find_elements(
page.find_elements(
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
)
)
while state["course_idx"] < course_links_len:
course_link = driver.find_element(
"id", "gdvCourseEQ"
@ -312,6 +335,7 @@ def main():
raise e
state["course_idx"] = 0
state["course_pg"] += 1
institutions.update(
{inst_name: {"city": city, "state": us_state, "courses": courses}}
)