diff --git a/transfer_scraper/main.py b/transfer_scraper/main.py
index 936e626..f118b31 100644
--- a/transfer_scraper/main.py
+++ b/transfer_scraper/main.py
@@ -214,12 +214,16 @@ def scrape_institution(index, page_num):
def parse_course_td(td, note=None):
- course_info = (
- html.unescape(td.get_attribute("innerHTML")).strip().split("
")[0].split()
- )
+ # This regex removes spaces next to parentheses. For example,
+ # Calculus II ( 04) -> Calculus II (04)
+ course_info = re.sub(
+ "(?<=[\[{(])\s+|\s+(?=[\]})])",
+ "",
+ html.unescape(td.get_attribute("innerHTML")).strip().split("
")[0],
+ ).split()
# Not all schools use the same course code format, so this figures out how long
- # it is if it exists, it will not exist for Not Transferrable.
+ # it is if it exists. It will not exist for Not Transferrable and AP tests.
try:
course_id_delim = 1 + list(
bool(re.search(r"\d", s)) for s in course_info
@@ -227,21 +231,27 @@ def parse_course_td(td, note=None):
except ValueError:
course_id_delim = 1
- # Same deal with credit counts.
+ # Same deal with credit counts. Fancy logic here to avoid catching course titles
+ # with parentheses in them which do not have a credit count, this happened 3 times
+ # This also ignores credit counts with "Variable" in them, but ... you try
try:
- cr_delim = (
- len(course_info)
- - 1
- - list(
- bool(re.search(r"^\([0-9]", s.strip())) for s in course_info[::-1]
- ).index(True)
- )
- assert bool(re.search(r"[0-9]\)", course_info[-1]))
+ if course_info[-1] == "()":
+ cr_delim = -1
+ else:
+ cr_delim = (
+ len(course_info)
+ - 1
+ - list(
+ bool(re.search(r"^\([.]*[0-9]", s.strip()))
+ for s in course_info[::-1]
+ ).index(True)
+ )
+ assert bool(re.search(r"[0-9]\)", course_info[-1]))
except (ValueError, AssertionError):
cr_delim = len(course_info)
# note serves as a credit count override, since the RPI-side credit counts
- # are inaccurate
+ # are inaccurate.
out = {
"id": " ".join(course_info[:course_id_delim]),
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),