mirror of
https://github.com/quatalog/quatalog.git
synced 2025-11-29 15:55:55 +00:00
Fix handling of course entries that list multiple courses
This commit is contained in:
parent
4c0517f6c4
commit
b017436be9
|
|
@ -195,8 +195,9 @@ def scrape_institution(index, page_num):
|
||||||
|
|
||||||
transfer_courses = [
|
transfer_courses = [
|
||||||
{
|
{
|
||||||
"transfer": parse_course_td(transfer_course),
|
"transfer": parse_course_td(transfer_course, True),
|
||||||
"rpi": parse_course_td(rpi_course, note.text.strip()),
|
"rpi": parse_course_td(rpi_course, False),
|
||||||
|
"note": note.text.strip(),
|
||||||
"begin": begin.text.strip(),
|
"begin": begin.text.strip(),
|
||||||
"end": end.text.strip(),
|
"end": end.text.strip(),
|
||||||
}
|
}
|
||||||
|
|
@ -215,15 +216,25 @@ def scrape_institution(index, page_num):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_course_td(td, note=None):
|
# Scrape course entries. The include_credits switch lets callers omit credit counts, because the
|
||||||
|
# RPI-side credit counts are wrong most of the time, and the discrepancy is clarified in the notes.
|
||||||
|
def parse_course_td(td, include_credits):
|
||||||
# This regex removes spaces next to parentheses. For example,
|
# This regex removes spaces next to parentheses. For example,
|
||||||
# Calculus II ( 04) -> Calculus II (04)
|
# Calculus II ( 04) -> Calculus II (04)
|
||||||
course_info = re.sub(
|
td_text = html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")
|
||||||
"(?<=[\[{(])\s+|\s+(?=[\]})])",
|
courses_info = [
|
||||||
"",
|
re.sub(
|
||||||
html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0],
|
"(?<=[\[{(])\s+|\s+(?=[\]})])",
|
||||||
).split()
|
"",
|
||||||
|
x,
|
||||||
|
).split()
|
||||||
|
for x in td_text[: len(td_text) - 3]
|
||||||
|
]
|
||||||
|
|
||||||
|
return [parse_one_course(x, include_credits) for x in courses_info]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_one_course(course_info, include_credits):
|
||||||
# Not all schools use the same course code format, so this figures out how long
|
# Not all schools use the same course code format, so this figures out how long
|
||||||
# it is if it exists. It will not exist for Not Transferrable and AP tests.
|
# it is if it exists. It will not exist for Not Transferrable and AP tests.
|
||||||
try:
|
try:
|
||||||
|
|
@ -257,14 +268,10 @@ def parse_course_td(td, note=None):
|
||||||
out = {
|
out = {
|
||||||
"id": " ".join(course_info[:course_id_delim]),
|
"id": " ".join(course_info[:course_id_delim]),
|
||||||
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
|
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
|
||||||
"catalog": td.find_element(By.TAG_NAME, "span").text,
|
|
||||||
}
|
}
|
||||||
if note is None:
|
if include_credits:
|
||||||
out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1]).strip()}),
|
out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1]).strip()}),
|
||||||
return out
|
return out
|
||||||
else:
|
|
||||||
out.update({"note": note})
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue