From f216c457486abb6b872e40f535e3ef0fcf533358 Mon Sep 17 00:00:00 2001 From: powe97 <116031952+powe97@users.noreply.github.com> Date: Thu, 29 Feb 2024 20:49:45 -0500 Subject: [PATCH] Add if __name__ == "__main__" and fix workflow --- .github/workflows/transfer.yml | 7 +- transfer_scraper/main.py | 419 +++++++++++++++++---------------- 2 files changed, 221 insertions(+), 205 deletions(-) diff --git a/.github/workflows/transfer.yml b/.github/workflows/transfer.yml index 0d9e8dc..609450b 100644 --- a/.github/workflows/transfer.yml +++ b/.github/workflows/transfer.yml @@ -29,14 +29,17 @@ jobs: - name: Install dependencies working-directory: quatalog-scraping/transfer_scraper - run: pip install -r 'requirements.txt' + run: | + python -m pip install --upgrade pip + pip install -r 'requirements.txt' - name: Log IP run: | echo "Public IP: $(curl -s 'https://ipinfo.io/ip')" - name: Scrape transfer guide - run: python3 quatalog-scraping/transfer_scraper data/transfer.json data/transfer_state.json + run: | + python3 quatalog-scraping/transfer_scraper/main.py data/transfer.json data/transfer_state.json - name: Upload data to artifact uses: actions/upload-artifact@v4 diff --git a/transfer_scraper/main.py b/transfer_scraper/main.py index bbcf81d..770a3da 100644 --- a/transfer_scraper/main.py +++ b/transfer_scraper/main.py @@ -100,222 +100,235 @@ def scrape_course_card(html_id, i, note): } -if len(sys.argv) != 3: - print(f"USAGE: python {sys.argv[0]} ") - exit(1) +def main(): + if len(sys.argv) != 3: + print(f"USAGE: python {sys.argv[0]} ") + exit(1) -transfer_json_path = sys.argv[1] -state_json_path = sys.argv[2] + transfer_json_path = sys.argv[1] + state_json_path = sys.argv[2] -options = webdriver.FirefoxOptions() -user_agent = UserAgent().random -print(f"Using randomized user agent {user_agent}", file=sys.stderr) -if sys.argv[-1] != "gui": - options.add_argument("--headless") -options.set_preference("general.useragent.override", user_agent) -driver = webdriver.Firefox(options=options) -driver.get( - "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce" -) + options = webdriver.FirefoxOptions() + user_agent = UserAgent().random + print(f"Using randomized user agent {user_agent}", file=sys.stderr) + if sys.argv[-1] != "gui": + options.add_argument("--headless") + options.set_preference("general.useragent.override", user_agent) + driver = webdriver.Firefox(options=options) + driver.get( + "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce" + ) -num_pages = int( - driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1] -) -print(f"{num_pages} pages detected", file=sys.stderr) + num_pages = int( + driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1] + ) + print(f"{num_pages} pages detected", file=sys.stderr) -state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0} -institutions = {} -if os.path.isfile(state_json_path): - with open(state_json_path, "r") as statejson: - state = json.load(statejson) -if os.path.isfile(transfer_json_path): - with open(transfer_json_path, "r") as transferjson: - institutions = json.load(transferjson) + state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0} + institutions = {} + if os.path.isfile(state_json_path): + with open(state_json_path, "r") as statejson: + state = json.load(statejson) + if os.path.isfile(transfer_json_path): + with open(transfer_json_path, "r") as transferjson: + institutions = json.load(transferjson) -print("Loaded state: ", end="", file=sys.stderr) -json.dump(state, sys.stderr, indent=4) -print("", file=sys.stderr) + print("Loaded state: ", end="", file=sys.stderr) + json.dump(state, sys.stderr, indent=4) + print("", file=sys.stderr) + # Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C + signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt)) + alarm(60 * 60 * 2) -# Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C -signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt)) -alarm(60 * 60 * 2) + try: + curr_page = 1 + while state["inst_pg"] <= num_pages: + page = driver.find_element("id", f"gdvInstWithEQ") - -try: - curr_page = 1 - while state["inst_pg"] <= num_pages: - page = driver.find_element("id", f"gdvInstWithEQ") - - if state["inst_pg"] != 1: - while curr_page != state["inst_pg"]: - print(f"Jumping to institution page {curr_page}", file=sys.stderr) - jumpable_pages = { - int(x.get_attribute("href").split("'")[3][5:]): x - for x in driver.find_elements( - By.CSS_SELECTOR, - """a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""", - ) - } - curr_page = int( - driver.find_element( - "id", "lblInstWithEQPaginationInfo" - ).text.split()[-3] - ) - if state["inst_pg"] in jumpable_pages: - jumpable_pages[state["inst_pg"]].click() - curr_page = state["inst_pg"] - elif state["inst_pg"] < min(jumpable_pages): - jumpable_pages[min(jumpable_pages)].click() - curr_page = min(jumpable_pages) - else: - jumpable_pages[max(jumpable_pages)].click() - curr_page = max(jumpable_pages) - - wait(EC.staleness_of(page)) - sleep(random.uniform(3, 6)) - page = driver.find_element("id", f"gdvInstWithEQ") - - inst_list_len = len( - page.find_elements( - By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]" - ) - ) - while state["inst_idx"] < inst_list_len: - institution_link = driver.find_element("id", "gdvInstWithEQ").find_elements( - By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]" - )[state["inst_idx"]] - fields = institution_link.find_element(By.XPATH, "../..").find_elements( - By.CSS_SELECTOR, ".gdv_boundfield_uppercase" - ) - inst_name = institution_link.text.title().strip() - city = fields[0].text.title().strip() - us_state = fields[1].text.strip() - - institution_link.click() - wait(EC.staleness_of(institution_link)) - - try: - course_pages_len = int( - driver.find_element( - "id", "lblInstWithEQPaginationInfo" - ).text.split()[-1] - ) - except NoSuchElementException: - course_pages_len = 1 - - try: - courses = institutions[inst_name]["courses"] - except: - courses = [] - - while state["course_pg"] <= course_pages_len: - course_links_len = len( - driver.find_element("id", "gdvCourseEQ").find_elements( - By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]" - ) - ) - while state["course_idx"] < course_links_len: - course_link = driver.find_element( - "id", "gdvCourseEQ" - ).find_elements( - By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]" - )[ - state["course_idx"] - ] - course_link.click() - - try: - wait( - EC.element_to_be_clickable( - (By.CSS_SELECTOR, ".modal-header button") - ) + if state["inst_pg"] != 1: + while curr_page != state["inst_pg"]: + print(f"Jumping to institution page {curr_page}", file=sys.stderr) + jumpable_pages = { + int(x.get_attribute("href").split("'")[3][5:]): x + for x in driver.find_elements( + By.CSS_SELECTOR, + """a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""", ) - - transfer = [ - scrape_course_card("lblSendCourseEQDetail", i, False) - for i in range( - 0, - len( - driver.find_element( - "id", "lblSendCourseEQDetail" - ).find_elements(By.CSS_SELECTOR, ".course-detail") - ), - ) - ] - - rpi = [ - scrape_course_card("lblReceiveCourseEQDetail", i, True) - for i in range( - 0, - len( - driver.find_element( - "id", "lblReceiveCourseEQDetail" - ).find_elements(By.CSS_SELECTOR, ".course-detail") - ), - ) - ] - - print( - f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})", - file=sys.stderr, - ) - - begin_date = driver.find_element( - "id", "lblBeginEffectiveDate" - ).text - end_date = driver.find_element("id", "lblEndEffectiveDate").text - + } + curr_page = int( driver.find_element( - By.CSS_SELECTOR, ".modal-header button" - ).click() + "id", "lblInstWithEQPaginationInfo" + ).text.split()[-3] + ) + if state["inst_pg"] in jumpable_pages: + jumpable_pages[state["inst_pg"]].click() + curr_page = state["inst_pg"] + elif state["inst_pg"] < min(jumpable_pages): + jumpable_pages[min(jumpable_pages)].click() + curr_page = min(jumpable_pages) + else: + jumpable_pages[max(jumpable_pages)].click() + curr_page = max(jumpable_pages) - courses += [ - { - "transfer": transfer, - "rpi": rpi, - "begin": begin_date, - "end": end_date, - } - ] - state["course_idx"] += 1 - except Exception as e: - institutions.update( - { - inst_name: { - "city": city, - "state": us_state, - "courses": courses, - } - } - ) - raise e - state["course_idx"] = 0 - state["course_pg"] += 1 - institutions.update( - {inst_name: {"city": city, "state": us_state, "courses": courses}} - ) - state["course_pg"] = 1 - state["inst_idx"] += 1 + wait(EC.staleness_of(page)) + sleep(random.uniform(3, 6)) + page = driver.find_element("id", f"gdvInstWithEQ") - driver.find_element("id", "btnSwitchView").click() - wait( - EC.text_to_be_present_in_element( - ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"]) + inst_list_len = len( + page.find_elements( + By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]" ) ) - state["inst_idx"] = 0 - state["inst_pg"] = (state["inst_pg"] % num_pages) + 1 + while state["inst_idx"] < inst_list_len: + institution_link = driver.find_element( + "id", "gdvInstWithEQ" + ).find_elements( + By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]" + )[ + state["inst_idx"] + ] + fields = institution_link.find_element(By.XPATH, "../..").find_elements( + By.CSS_SELECTOR, ".gdv_boundfield_uppercase" + ) + inst_name = institution_link.text.title().strip() + city = fields[0].text.title().strip() + us_state = fields[1].text.strip() -except (Exception, KeyboardInterrupt) as e: - print("Program hits exception and will save and terminate", file=sys.stderr) - print(traceback.format_exc(), file=sys.stderr) + institution_link.click() + wait(EC.staleness_of(institution_link)) -print("Program will terminate with state: ", end="", file=sys.stderr) -json.dump(state, sys.stderr, indent=4) -print("", file=sys.stderr) -with open(transfer_json_path, "w") as transferjson: - json.dump(institutions, transferjson, indent=4) -with open(state_json_path, "w") as statejson: - json.dump(state, statejson, indent=4) -driver.quit() + try: + course_pages_len = int( + driver.find_element( + "id", "lblInstWithEQPaginationInfo" + ).text.split()[-1] + ) + except NoSuchElementException: + course_pages_len = 1 + + try: + courses = institutions[inst_name]["courses"] + except: + courses = [] + + while state["course_pg"] <= course_pages_len: + course_links_len = len( + driver.find_element("id", "gdvCourseEQ").find_elements( + By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]" + ) + ) + while state["course_idx"] < course_links_len: + course_link = driver.find_element( + "id", "gdvCourseEQ" + ).find_elements( + By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]" + )[ + state["course_idx"] + ] + course_link.click() + + try: + wait( + EC.element_to_be_clickable( + (By.CSS_SELECTOR, ".modal-header button") + ) + ) + + transfer = [ + scrape_course_card("lblSendCourseEQDetail", i, False) + for i in range( + 0, + len( + driver.find_element( + "id", "lblSendCourseEQDetail" + ).find_elements( + By.CSS_SELECTOR, ".course-detail" + ) + ), + ) + ] + + rpi = [ + scrape_course_card("lblReceiveCourseEQDetail", i, True) + for i in range( + 0, + len( + driver.find_element( + "id", "lblReceiveCourseEQDetail" + ).find_elements( + By.CSS_SELECTOR, ".course-detail" + ) + ), + ) + ] + + print( + f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})", + file=sys.stderr, + ) + + begin_date = driver.find_element( + "id", "lblBeginEffectiveDate" + ).text + end_date = driver.find_element( + "id", "lblEndEffectiveDate" + ).text + + driver.find_element( + By.CSS_SELECTOR, ".modal-header button" + ).click() + + courses += [ + { + "transfer": transfer, + "rpi": rpi, + "begin": begin_date, + "end": end_date, + } + ] + state["course_idx"] += 1 + except Exception as e: + institutions.update( + { + inst_name: { + "city": city, + "state": us_state, + "courses": courses, + } + } + ) + raise e + state["course_idx"] = 0 + state["course_pg"] += 1 + institutions.update( + {inst_name: {"city": city, "state": us_state, "courses": courses}} + ) + state["course_pg"] = 1 + state["inst_idx"] += 1 + + driver.find_element("id", "btnSwitchView").click() + wait( + EC.text_to_be_present_in_element( + ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"]) + ) + ) + state["inst_idx"] = 0 + state["inst_pg"] = (state["inst_pg"] % num_pages) + 1 + + except (Exception, KeyboardInterrupt) as e: + print("Program hits exception and will save and terminate", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + + print("Program will terminate with state: ", end="", file=sys.stderr) + json.dump(state, sys.stderr, indent=4) + print("", file=sys.stderr) + with open(transfer_json_path, "w") as transferjson: + json.dump(institutions, transferjson, indent=4) + with open(state_json_path, "w") as statejson: + json.dump(state, statejson, indent=4) + driver.quit() + + +if __name__ == "__main__": + main()