From 4f69c1d8a0df0fe7d653b29d9641d549f28cedfa Mon Sep 17 00:00:00 2001 From: powe97 <116031952+powe97@users.noreply.github.com> Date: Tue, 5 Mar 2024 21:14:00 -0500 Subject: [PATCH] Re-get the page to try circumvent timeout --- transfer_scraper/main.py | 58 ++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/transfer_scraper/main.py b/transfer_scraper/main.py index 0d7ba5f..d6675bc 100644 --- a/transfer_scraper/main.py +++ b/transfer_scraper/main.py @@ -29,7 +29,7 @@ def wait(ec): global driver WebDriverWait( - driver, 300, ignored_exceptions=[StaleElementReferenceException] + driver, 40, ignored_exceptions=[StaleElementReferenceException] ).until(ec) sleep(random.uniform(400, 1900) / 1000) @@ -51,7 +51,9 @@ def jump_to_page(curr_page, to_page, postback_type, pagination_type): return 1, page if to_page > num_pages or to_page < 1: - raise ValueError(f"to_page was out of range ({to_page} not in [1, {num_pages}])") + raise ValueError( + f"to_page was out of range ({to_page} not in [1, {num_pages}])" + ) while curr_page != to_page: jumpable_pages = { int(x.get_attribute("href").split("'")[3][5:]): x @@ -84,21 +86,39 @@ def jump_to_page(curr_page, to_page, postback_type, pagination_type): # page_num: The page to scrape. # Note that the current page before running this function must be 1. def scrape_page(page_num): + global driver + global options + + driver = webdriver.Firefox(options=options) + driver.get( + "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce" + ) jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo") + num_institutions = len( driver.find_elements( By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]" ) ) + driver.quit() + print(f"Scraping page {page_num}, found {num_institutions} links", file=sys.stderr) - return [scrape_institution(i) for i in range(0, num_institutions)] + return [scrape_institution(i, page_num) for i in range(0, num_institutions)] # scrape_institution: Scrapes an institution by index. # # index: the 0-indexed index of the instituion to scrape on the page we are on. -def scrape_institution(index): - # Go to institution page +def scrape_institution(index, page_num): + global driver + global options + + driver = webdriver.Firefox(options=options) + driver.get( + "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce" + ) + jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo") + inst_link = driver.find_element( By.ID, f"gdvInstWithEQ_btnCreditFromInstName_{index}" ) @@ -155,18 +175,7 @@ def scrape_institution(index): ) ] - # Clear list - tr = driver.find_element(By.ID, "gdvMyCourseEQList").find_element(By.TAG_NAME, "tr") - driver.find_element(By.ID, "btnClearMyList").click() - wait(EC.staleness_of(tr)) - - # Exit My List menu - driver.find_element(By.CSS_SELECTOR, "#udpAddCourseEQToMyList button.close").click() - - # Leave institution page - switch_view = driver.find_element(By.ID, "btnSwitchView") - switch_view.click() - wait(EC.staleness_of(switch_view)) + driver.quit() return { "institution": inst_name, @@ -217,9 +226,13 @@ def parse_course_td(td, note=None): def main(): global driver + global options if len(sys.argv) != 3: - print(f"USAGE: python {sys.argv[0]} ", file=sys.stderr) + print( + f"USAGE: python {sys.argv[0]} ", + file=sys.stderr, + ) return 1 PAGE_NUM_TO_SCRAPE = int(sys.argv[1]) @@ -227,19 +240,12 @@ def main(): print(f"Setting up selenium Firefox emulator", file=sys.stderr) options = webdriver.FirefoxOptions() - options.add_argument("--headless") + # options.add_argument("--headless") user_agent = UserAgent().random options.set_preference("general.useragent.override", user_agent) print(f"Using randomized user agent {user_agent}", file=sys.stderr) - driver = webdriver.Firefox(options=options) - - print(f"Connecting to the TES Public View", file=sys.stderr) - driver.get( - "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce" - ) - with open(OUT_FILENAME, "w") as transferjson: json.dump(scrape_page(PAGE_NUM_TO_SCRAPE), transferjson, indent=4)