Fix credit count parsing

2024-09-28 20:48:57 +00:00 · 2024-03-07 11:03:59 -06:00 · 2024-03-07 11:03:59 -06:00 · 779b979b9b
parent 0f3652d8cc
commit 779b979b9b
1 changed files with 24 additions and 14 deletions
--- a/transfer_scraper/main.py
+++ b/transfer_scraper/main.py
@ -214,12 +214,16 @@ def scrape_institution(index, page_num):
 def parse_course_td(td, note=None):
-    course_info = (
+    # This regex strips whitespace just inside (), [] and {}. For example,
-        html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
+    #       Calculus II ( 04) -> Calculus II (04)
-    )
+    course_info = re.sub(
        "(?<=[\[{(])\s+|\s+(?=[\]})])",
        "",
        html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0],
    ).split()
    # Not all schools use the same course code format, so this figures out how long
-    # it is if it exists, it will not exist for Not Transferrable.
+    # it is if it exists. It will not exist for Not Transferrable and AP tests.
    try:
        course_id_delim = 1 + list(
            bool(re.search(r"\d", s)) for s in course_info
@ -227,21 +231,27 @@ def parse_course_td(td, note=None):
    except ValueError:
        course_id_delim = 1
-    # Same deal with credit counts.
+    # Same deal with credit counts. Fancy logic here to avoid catching course titles
    # with parentheses in them which do not have a credit count; this happened 3 times.
    # This also ignores credit counts with "Variable" in them, but ... you try
    try:
-        cr_delim = (
+        if course_info[-1] == "()":
-            len(course_info)
+            cr_delim = -1
-            - 1
+        else:
-            - list(
+            cr_delim = (
-                bool(re.search(r"^\([0-9]", s.strip())) for s in course_info[::-1]
+                len(course_info)
-            ).index(True)
+                - 1
-        )
+                - list(
-        assert bool(re.search(r"[0-9]\)", course_info[-1]))
+                    bool(re.search(r"^\([.]*[0-9]", s.strip()))
                    for s in course_info[::-1]
                ).index(True)
            )
            assert bool(re.search(r"[0-9]\)", course_info[-1]))
    except (ValueError, AssertionError):
        cr_delim = len(course_info)
    # note serves as a credit count override, since the RPI-side credit counts
-    # are inaccurate
+    # are inaccurate.
    out = {
        "id": " ".join(course_info[:course_id_delim]),
        "name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),