Fix Roman numerals issue

This commit is contained in:
powe97 2024-03-01 13:32:02 -05:00
parent d03be03aeb
commit 3b608fad41
No known key found for this signature in database
GPG key ID: 7D1663B10978D1BA

View file

@ -21,13 +21,12 @@ def raise_(ex):
raise ex raise ex
# A "word" is a run of ASCII letters, optionally followed by an apostrophe
# suffix (e.g. "WOMEN'S"), so the suffix is lower-cased with the rest of the
# word instead of being re-capitalized the way str.title() would do.
_TITLE_WORD_RE = re.compile(r"[A-Za-z]+('[A-Za-z]+)?")

# Multi-letter Roman numerals II..IX as they look after capitalization.
# I, V and X are single letters and already come out upper-case.
_TITLE_ROMAN_RE = re.compile(r"\b(Viii|Vii|Vi|Iv|Ix|Iii|Ii)\b")


# Fix course titles accounting for Roman numerals up to X
def normalize_title(input):
    """Return *input* in title case with Roman numerals II-IX upper-cased.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is removed. Every alphabetic word is capitalized (first letter
    upper-case, the rest lower-case), regardless of the case it arrived in.
    Words that are exactly a Roman numeral between II and IX are then
    restored to upper-case.

    Args:
        input: The raw text to normalize (parameter name kept for
            compatibility with existing callers, although it shadows the
            builtin).

    Returns:
        The normalized title string; an empty string for blank input.
    """
    # split()/join collapses internal whitespace and trims both ends.
    s = " ".join(input.split())
    # The character class must cover lower-case letters too; otherwise
    # lower- or mixed-case input is only partially matched and words such as
    # "jazz" are mangled or left untouched.
    s = _TITLE_WORD_RE.sub(lambda m: m.group(0).capitalize(), s)
    # Capitalization turned "III" into "Iii"; put numerals back in capitals.
    s = _TITLE_ROMAN_RE.sub(lambda m: m.group(0).upper(), s)
    return s
def wait(ec): def wait(ec):
@ -53,12 +52,12 @@ def scrape_course_card(html_id, i, note):
if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2": if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
course_desc = trs[1].text course_desc = trs[1].text
course_department = ( course_department = normalize_title(
next((x for x in trs if x.text.strip().startswith("Department:"))) next((x for x in trs if x.text.strip().startswith("Department:")))
.find_elements(By.TAG_NAME, "td")[1] .find_elements(By.TAG_NAME, "td")[1]
.text.title() .text
) )
course_catalog = ( course_catalog = normalize_title(
next((x for x in trs if x.text.strip().startswith("Source catalog:"))) next((x for x in trs if x.text.strip().startswith("Source catalog:")))
.find_elements(By.TAG_NAME, "td")[1] .find_elements(By.TAG_NAME, "td")[1]
.text .text
@ -69,10 +68,10 @@ def scrape_course_card(html_id, i, note):
i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v)) i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
) )
course_id = " ".join(course_name_and_id[0:k]) course_id = " ".join(course_name_and_id[0:k])
course_name = normalize_class_name(" ".join(course_name_and_id[k:])) course_name = normalize_title(" ".join(course_name_and_id[k:]))
except StopIteration: # Handling for Not Transferrable except StopIteration: # Handling for Not Transferrable
course_id = course_name_and_id[0] course_id = course_name_and_id[0]
course_name = normalize_class_name(" ".join(course_name_and_id[1:])) course_name = normalize_title(" ".join(course_name_and_id[1:]))
if not note: if not note:
try: try:
@ -202,8 +201,8 @@ def main():
fields = institution_link.find_element(By.XPATH, "../..").find_elements( fields = institution_link.find_element(By.XPATH, "../..").find_elements(
By.CSS_SELECTOR, ".gdv_boundfield_uppercase" By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
) )
inst_name = institution_link.text.title().strip() inst_name = normalize_title(institution_link.text)
city = fields[0].text.title().strip() city = normalize_title(fields[0].text)
us_state = fields[1].text.strip() us_state = fields[1].text.strip()
institution_link.click() institution_link.click()