From 3b608fad410beb20c79c2624c74c5b7e9242e323 Mon Sep 17 00:00:00 2001
From: powe97 <116031952+powe97@users.noreply.github.com>
Date: Fri, 1 Mar 2024 13:32:02 -0500
Subject: [PATCH] Fix Roman numerals issue

---
 transfer_scraper/main.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/transfer_scraper/main.py b/transfer_scraper/main.py
index 6c751c3..1ccd4e0 100644
--- a/transfer_scraper/main.py
+++ b/transfer_scraper/main.py
@@ -21,13 +21,12 @@ def raise_(ex):
     raise ex
 
 
-def normalize_class_name(input):
-    text = list(input)
-    for i in range(1, len(text)):
-        if (text[i - 1] == " ") or (text[i - 1] == text[i] == "I"):
-            continue
-        text[i] = text[i].lower()
-    return "".join(text)
+# Collapse whitespace and title-case, keeping Roman numerals up to X upper-case
+def normalize_title(input):
+    s = " ".join(input.split())
+    s = re.sub(r"[A-Za-z]+('[A-Za-z]+)?", lambda m: m.group(0).capitalize(), s)
+    s = re.sub(r"\b(Viii|Vii|Vi|Iv|Ix|Iii|Ii)\b", lambda a: a.group(0).upper(), s)
+    return s.strip()
 
 
 def wait(ec):
@@ -53,12 +52,12 @@ def scrape_course_card(html_id, i, note):
     if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
         course_desc = trs[1].text
 
-    course_department = (
+    course_department = normalize_title(
         next((x for x in trs if x.text.strip().startswith("Department:")))
         .find_elements(By.TAG_NAME, "td")[1]
-        .text.title()
+        .text
     )
-    course_catalog = (
+    course_catalog = normalize_title(
         next((x for x in trs if x.text.strip().startswith("Source catalog:")))
         .find_elements(By.TAG_NAME, "td")[1]
         .text
@@ -69,10 +68,10 @@ def scrape_course_card(html_id, i, note):
             i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
         )
         course_id = " ".join(course_name_and_id[0:k])
-        course_name = normalize_class_name(" ".join(course_name_and_id[k:]))
+        course_name = normalize_title(" ".join(course_name_and_id[k:]))
     except StopIteration:  # Handling for Not Transferrable
         course_id = course_name_and_id[0]
-        course_name = normalize_class_name(" ".join(course_name_and_id[1:]))
+        course_name = normalize_title(" ".join(course_name_and_id[1:]))
 
     if not note:
         try:
@@ -202,8 +201,8 @@ def main():
                 fields = institution_link.find_element(By.XPATH, "../..").find_elements(
                     By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
                 )
-                inst_name = institution_link.text.title().strip()
-                city = fields[0].text.title().strip()
+                inst_name = normalize_title(institution_link.text)
+                city = normalize_title(fields[0].text)
                 us_state = fields[1].text.strip()
 
                 institution_link.click()