Re-add catalog name scraping

stderr
Log IP
2024-11-18 04:52:51 +00:00 · 2024-03-13 23:52:51 -04:00 · 2024-03-13 22:19:56 -04:00 · 2024-03-13 22:18:13 -04:00
1 changed file with 14 additions and 5 deletions
--- a/transfer_scraper/main.py
+++ b/transfer_scraper/main.py
@@ -93,6 +93,11 @@ def scrape_page(page_num):
    for i in range(1, 4):
        try:
            driver = webdriver.Firefox(options=options)
+            driver.get("https://ipinfo.io/ip")
+            print(
+                f'Trying with IP {driver.find_element(By.TAG_NAME, "body").text}',
+                file=sys.stderr,
+            )
            driver.get(
                "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
            )
@@ -102,13 +107,14 @@ def scrape_page(page_num):
            break
        except Exception as e:
            driver.quit()
+
            print(
                f"Attempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
                file=sys.stderr,
            )
            sleep(25)
    else:
-        raise Exception(f"Failed to load the main page after 15 attempts, aborting.")
+        raise Exception(f"Failed to load the main page after 4 attempts, aborting.")

    num_institutions = len(
        driver.find_elements(
@@ -133,7 +139,7 @@ def scrape_institution_safe(index, page_num):
            )
            sleep(25)
    else:
-        raise Exception(f"Failed to scrape {index} after 15 attempts, aborting.")
+        raise Exception(f"Failed to scrape {index} after 4 attempts, aborting.")


 # scrape_institution: Scrapes an institution by index.
@@ -181,7 +187,7 @@ def scrape_institution(index, page_num):
            "institution": inst_name,
            "city": inst_city,
            "state": inst_state,
-            "courses": [],
+            "transfers": [],
        }

    # Open list
@@ -212,7 +218,7 @@ def scrape_institution(index, page_num):
        "institution": inst_name,
        "city": inst_city,
        "state": inst_state,
-        "courses": transfer_courses,
+        "transfers": transfer_courses,
    }


@@ -231,7 +237,10 @@ def parse_course_td(td, include_credits):
        for x in td_text[: len(td_text) - 3]
    ]

-    return [parse_one_course(x, include_credits) for x in courses_info]
+    return {
+        "catalog": td.find_element(By.TAG_NAME, "span").text.strip(),
+        "courses": [parse_one_course(x, include_credits) for x in courses_info],
+    }


 def parse_one_course(course_info, include_credits):
Author	SHA1	Message	Date
powe97	517952f977	Re-add catalog name scraping	2024-03-13 23:52:51 -04:00
powe97	f1a47dca48	stderr	2024-03-13 22:19:56 -04:00
powe97	af25410c5d	Log IP	2024-03-13 22:18:13 -04:00