mirror of
https://github.com/quatalog/quatalog.git
synced 2024-09-28 20:48:57 +00:00
Fix credit count parsing
This commit is contained in:
parent
0f3652d8cc
commit
779b979b9b
|
@ -214,12 +214,16 @@ def scrape_institution(index, page_num):
|
||||||
|
|
||||||
|
|
||||||
def parse_course_td(td, note=None):
|
def parse_course_td(td, note=None):
|
||||||
course_info = (
|
# This regex removes spaces next to parentheses. For example,
|
||||||
html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
|
# Calculus II ( 04) -> Calculus II (04)
|
||||||
)
|
course_info = re.sub(
|
||||||
|
"(?<=[\[{(])\s+|\s+(?=[\]})])",
|
||||||
|
"",
|
||||||
|
html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0],
|
||||||
|
).split()
|
||||||
|
|
||||||
# Not all schools use the same course code format, so this figures out how long
|
# Not all schools use the same course code format, so this figures out how long
|
||||||
# it is if it exists, it will not exist for Not Transferrable.
|
# it is if it exists. It will not exist for Not Transferrable and AP tests.
|
||||||
try:
|
try:
|
||||||
course_id_delim = 1 + list(
|
course_id_delim = 1 + list(
|
||||||
bool(re.search(r"\d", s)) for s in course_info
|
bool(re.search(r"\d", s)) for s in course_info
|
||||||
|
@ -227,21 +231,27 @@ def parse_course_td(td, note=None):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
course_id_delim = 1
|
course_id_delim = 1
|
||||||
|
|
||||||
# Same deal with credit counts.
|
# Same deal with credit counts. Fancy logic here to avoid catching course titles
|
||||||
|
# with parentheses in them which do not have a credit count, this happened 3 times
|
||||||
|
# This also ignores credit counts with "Variable" in them, but ... you try
|
||||||
try:
|
try:
|
||||||
cr_delim = (
|
if course_info[-1] == "()":
|
||||||
len(course_info)
|
cr_delim = -1
|
||||||
- 1
|
else:
|
||||||
- list(
|
cr_delim = (
|
||||||
bool(re.search(r"^\([0-9]", s.strip())) for s in course_info[::-1]
|
len(course_info)
|
||||||
).index(True)
|
- 1
|
||||||
)
|
- list(
|
||||||
assert bool(re.search(r"[0-9]\)", course_info[-1]))
|
bool(re.search(r"^\([.]*[0-9]", s.strip()))
|
||||||
|
for s in course_info[::-1]
|
||||||
|
).index(True)
|
||||||
|
)
|
||||||
|
assert bool(re.search(r"[0-9]\)", course_info[-1]))
|
||||||
except (ValueError, AssertionError):
|
except (ValueError, AssertionError):
|
||||||
cr_delim = len(course_info)
|
cr_delim = len(course_info)
|
||||||
|
|
||||||
# note serves as a credit count override, since the RPI-side credit counts
|
# note serves as a credit count override, since the RPI-side credit counts
|
||||||
# are inaccurate
|
# are inaccurate.
|
||||||
out = {
|
out = {
|
||||||
"id": " ".join(course_info[:course_id_delim]),
|
"id": " ".join(course_info[:course_id_delim]),
|
||||||
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
|
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
|
||||||
|
|
Loading…
Reference in a new issue