diff --git a/transfer_scraper/main.py b/transfer_scraper/main.py
index 99298fd..3b33507 100644
--- a/transfer_scraper/main.py
+++ b/transfer_scraper/main.py
@@ -195,8 +195,9 @@ def scrape_institution(index, page_num):
transfer_courses = [
{
- "transfer": parse_course_td(transfer_course),
- "rpi": parse_course_td(rpi_course, note.text.strip()),
+ "transfer": parse_course_td(transfer_course, True),
+ "rpi": parse_course_td(rpi_course, False),
+ "note": note.text.strip(),
"begin": begin.text.strip(),
"end": end.text.strip(),
}
@@ -215,15 +216,25 @@ def scrape_institution(index, page_num):
}
-def parse_course_td(td, note=None):
+# Parse each course entry in a <td> cell and return them as a list. include_credits lets
+# callers omit credit counts: RPI-side counts are usually wrong and the notes clarify them.
+def parse_course_td(td, include_credits):
# This regex removes spaces next to parentheses. For example,
# Calculus II ( 04) -> Calculus II (04)
- course_info = re.sub(
- "(?<=[\[{(])\s+|\s+(?=[\]})])",
- "",
- html.unescape(td.get_attribute("innerHTML")).strip().split("
")[0],
- ).split()
+ td_text = html.unescape(td.get_attribute("innerHTML")).strip().split("
")
+ courses_info = [
+ re.sub(
+ r"(?<=[\[{(])\s+|\s+(?=[\]})])",
+ "",
+ x,
+ ).split()
+ for x in td_text[: len(td_text) - 3]
+ ]
+ return [parse_one_course(x, include_credits) for x in courses_info]
+
+
+def parse_one_course(course_info, include_credits):
# Not all schools use the same course code format, so this figures out how long
# it is if it exists. It will not exist for Not Transferrable and AP tests.
try:
@@ -257,14 +268,10 @@ def parse_course_td(td, note=None):
out = {
"id": " ".join(course_info[:course_id_delim]),
"name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
- "catalog": td.find_element(By.TAG_NAME, "span").text,
}
- if note is None:
+ if include_credits:
out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1]).strip()}),
- return out
- else:
- out.update({"note": note})
- return out
+ return out
def main():