Fix credit count parsing

This commit is contained in:
powe97 2024-03-07 11:03:59 -06:00
parent 0f3652d8cc
commit 779b979b9b
No known key found for this signature in database
GPG Key ID: 7D1663B10978D1BA
1 changed files with 24 additions and 14 deletions

View File

@ -214,12 +214,16 @@ def scrape_institution(index, page_num):
def parse_course_td(td, note=None):
course_info = (
html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
)
# This regex removes spaces next to parentheses. For example,
# Calculus II ( 04) -> Calculus II (04)
course_info = re.sub(
"(?<=[\[{(])\s+|\s+(?=[\]})])",
"",
html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0],
).split()
# Not all schools use the same course code format, so this figures out how long
# it is if it exists, it will not exist for Not Transferrable.
# it is if it exists. It will not exist for Not Transferrable and AP tests.
try:
course_id_delim = 1 + list(
bool(re.search(r"\d", s)) for s in course_info
@ -227,21 +231,27 @@ def parse_course_td(td, note=None):
except ValueError:
course_id_delim = 1
# Same deal with credit counts.
# Same deal with credit counts. Fancy logic here to avoid catching course titles
# with parentheses in them which do not have a credit count, this happened 3 times
# This also ignores credit counts with "Variable" in them, but ... you try
try:
cr_delim = (
len(course_info)
- 1
- list(
bool(re.search(r"^\([0-9]", s.strip())) for s in course_info[::-1]
).index(True)
)
assert bool(re.search(r"[0-9]\)", course_info[-1]))
if course_info[-1] == "()":
cr_delim = -1
else:
cr_delim = (
len(course_info)
- 1
- list(
bool(re.search(r"^\([.]*[0-9]", s.strip()))
for s in course_info[::-1]
).index(True)
)
assert bool(re.search(r"[0-9]\)", course_info[-1]))
except (ValueError, AssertionError):
cr_delim = len(course_info)
# note serves as a credit count override, since the RPI-side credit counts
# are inaccurate
# are inaccurate.
out = {
"id": " ".join(course_info[:course_id_delim]),
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),