Redesign scraper to not be unbearably slow

powe97 2024-03-05 18:33:54 -05:00
parent 976b553b14
commit 6ad6f85708
No known key found for this signature in database
GPG key ID: 7D1663B10978D1BA


@@ -3,22 +3,17 @@ import html
 import sys
 import re
 import os.path
-import traceback
 from time import sleep
 import random
-from signal import alarm, SIGALRM, signal
 from fake_useragent import UserAgent
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import StaleElementReferenceException
-from selenium.common.exceptions import TimeoutException
-from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import (
+    StaleElementReferenceException,
+    NoSuchElementException,
+)
 
 
-def raise_(ex):
-    raise ex
-
-
 # Fix course titles accounting for Roman numerals up to X
@@ -29,84 +24,34 @@ def normalize_title(input):
     return s.strip()
 
 
+# Waits until EC plus some random wait time
 def wait(ec):
     global driver
     WebDriverWait(
-        driver, 20, ignored_exceptions=[StaleElementReferenceException]
+        driver, 60, ignored_exceptions=[StaleElementReferenceException]
     ).until(ec)
     sleep(random.uniform(400, 1900) / 1000)
 
 
-def scrape_course_card(html_id, i, note):
+# jump_to_page: navigates to a paginated page on this insufferable website
+#
+# curr_page: the current page number
+# to_page: the page number to jump to
+# num_pages: the total number of pages, read from the pagination label
+# postback_type: javascript:__doPostBack('<this field>','Page$3')
+# pagination_type: <span id="<this field>">PAGE 1 OF 27<br></span>
+def jump_to_page(curr_page, to_page, postback_type, pagination_type):
     global driver
-    trs = (
-        driver.find_element("id", html_id)
-        .find_elements(By.CSS_SELECTOR, ".course-detail")[i]
-        .find_elements(By.TAG_NAME, "tr")
-    )
-    course_name_and_id = trs[0].text.split()
-    course_desc = ""
-    if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
-        course_desc = trs[1].text
-    course_department = normalize_title(
-        next((x for x in trs if x.text.strip().startswith("Department:")))
-        .find_elements(By.TAG_NAME, "td")[1]
-        .text
-    )
-    course_catalog = normalize_title(
-        next((x for x in trs if x.text.strip().startswith("Source catalog:")))
-        .find_elements(By.TAG_NAME, "td")[1]
-        .text
-    )
+    page = driver.find_element(By.ID, postback_type)
     try:
-        k = 1 + next(
-            i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
-        )
-        course_id = " ".join(course_name_and_id[0:k])
-        course_name = normalize_title(" ".join(course_name_and_id[k:]))
-    except StopIteration:  # Handling for Not Transferrable
-        course_id = course_name_and_id[0]
-        course_name = normalize_title(" ".join(course_name_and_id[1:]))
-    if not note:
-        try:
-            course_credits = (
-                next((x for x in trs if x.text.strip().startswith("Units:")))
-                .find_elements(By.TAG_NAME, "td")[1]
-                .text.strip()
-            )
-        except:
-            course_credits = ""
-        return {
-            "id": course_id,
-            "name": course_name,
-            "credits": course_credits,
-            "desc": course_desc,
-            "department": course_department,
-            "catalog": course_catalog,
-        }
-    else:
-        course_note = driver.find_element("id", "lblCommentsPublic").text.strip()
-        return {
-            "id": course_id,
-            "name": course_name,
-            "note": course_note,
-            "desc": course_desc,
-            "department": course_department,
-            "catalog": course_catalog,
-        }
-
-
-def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
-    page = driver.find_element("id", postback_type)
-    if num_pages == 1:
+        num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
+    except NoSuchElementException:
         return 1, page
-    if to_page > num_pages or to_page < 1:
-        raise ValueError(f"to_page was out of range ({to_page} not in [1, {num_pages})")
     while curr_page != to_page:
         jumpable_pages = {
             int(x.get_attribute("href").split("'")[3][5:]): x
@@ -117,7 +62,7 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
                 + """','Page$"]""",
             )
         }
-        curr_page = int(driver.find_element("id", pagination_type).text.split()[-3])
+        curr_page = int(driver.find_element(By.ID, pagination_type).text.split()[-3])
         if to_page in jumpable_pages:
             jumpable_pages[to_page].click()
             curr_page = to_page
@@ -131,247 +76,166 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
         wait(EC.staleness_of(page))
         sleep(random.uniform(400, 1900) / 1000)
-        page = driver.find_element("id", postback_type)
+        page = driver.find_element(By.ID, postback_type)
     return curr_page, page
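
Aside: the grid's page links are ASP.NET postbacks, so the target page is buried in each anchor's href as a "Page$N" argument rather than in a real URL. A minimal, self-contained sketch of the parsing that jump_to_page relies on (the full control ID here is invented for illustration):

    # Hypothetical href of the shape the comment above describes:
    href = "javascript:__doPostBack('ctl00$gdvInstWithEQ','Page$3')"
    # split("'")[3] grabs the second quoted argument ("Page$3");
    # [5:] strips the "Page$" prefix, leaving the page number.
    page_num = int(href.split("'")[3][5:])
    assert page_num == 3
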
 
 
+# scrape_page: Scrapes a page of institutions
+#
+# page_num: The page to scrape.
+# Note that the current page before running this function must be 1.
+def scrape_page(page_num):
+    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
+    num_institutions = len(
+        driver.find_elements(
+            By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
+        )
+    )
+    print(f"Scraping page {page_num}, found {num_institutions} links")
+    return [scrape_institution(i) for i in range(0, num_institutions)]
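
For reference, each element of the list scrape_page returns follows the dict shape assembled by scrape_institution below. The values in this sketch are invented; only the keys are taken from the code:

    example_institution = {
        "institution": "Example Community College",  # invented value
        "city": "Troy",
        "state": "NY",
        "courses": [
            {
                "transfer": {"id": "...", "name": "...", "catalog": "...", "credits": "..."},
                "rpi": {"id": "...", "name": "...", "catalog": "...", "note": "..."},
                "begin": "...",
                "end": "...",
            },
        ],
    }
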
+
+
+# scrape_institution: Scrapes an institution by index.
+#
+# index: the 0-indexed index of the institution to scrape on the page we are on.
+def scrape_institution(index):
+    # Go to institution page
+    inst_link = driver.find_element(
+        By.ID, f"gdvInstWithEQ_btnCreditFromInstName_{index}"
+    )
+    [inst_name, inst_city, inst_state, _] = [
+        e.text
+        for e in inst_link.find_element(By.XPATH, "../..").find_elements(
+            By.TAG_NAME, "td"
+        )
+    ]
+    inst_name, inst_city = normalize_title(inst_name), normalize_title(inst_city)
+    inst_link.click()
+    wait(EC.staleness_of(inst_link))
+    print(f"Scraping {inst_name} ({inst_city}, {inst_state})")
+    # Add all courses
+    try:
+        num_pages = int(
+            driver.find_element(By.ID, "lblCourseEQPaginationInfo").text.split()[-1]
+        )
+    except NoSuchElementException:
+        num_pages = 1
+    for i in range(1, num_pages + 1):
+        jump_to_page(max(1, i - 1), i, "gdvCourseEQ", "lblCourseEQPaginationInfo")
+        driver.find_element(By.ID, "gdvCourseEQ_cbxHeaderCheckAll").click()
+    # Open list
+    driver.find_element(By.ID, "btnAddToMyEQList").click()
+    wait(EC.visibility_of_element_located((By.ID, "gdvMyCourseEQList")))
+    # Scrape list
+    tds = driver.find_element(By.ID, "gdvMyCourseEQList").find_elements(
+        By.TAG_NAME, "td"
+    )
+    transfer_courses = [
+        {
+            "transfer": parse_course_td(transfer_course),
+            "rpi": parse_course_td(rpi_course, note.text.strip()),
+            "begin": begin.text.strip(),
+            "end": end.text.strip(),
+        }
+        for transfer_course, rpi_course, note, begin, end, _ in zip(
+            *[iter(x for x in tds)] * 6
+        )
+    ]
+    # Clear list
+    tr = driver.find_element(By.ID, "gdvMyCourseEQList").find_element(By.TAG_NAME, "tr")
+    driver.find_element(By.ID, "btnClearMyList").click()
+    wait(EC.staleness_of(tr))
+    # Exit My List menu
+    driver.find_element(By.CSS_SELECTOR, "#udpAddCourseEQToMyList button.close").click()
+    # Leave institution page
+    switch_view = driver.find_element(By.ID, "btnSwitchView")
+    switch_view.click()
+    wait(EC.staleness_of(switch_view))
+    return {
+        "institution": inst_name,
+        "city": inst_city,
+        "state": inst_state,
+        "courses": transfer_courses,
+    }
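
The zip(*[iter(x for x in tds)] * 6) expression above is the standard grouper idiom: the same iterator is repeated six times, so each zip step consumes six consecutive <td> cells, turning the flat cell list into one tuple per table row. A standalone illustration:

    cells = ["a1", "a2", "a3", "a4", "a5", "a6", "b1", "b2", "b3", "b4", "b5", "b6"]
    rows = list(zip(*[iter(cells)] * 6))
    # rows == [("a1", "a2", "a3", "a4", "a5", "a6"),
    #          ("b1", "b2", "b3", "b4", "b5", "b6")]
    assert len(rows) == 2 and len(rows[0]) == 6
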
+
+
+def parse_course_td(td, note=None):
+    course_info = (
+        html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
+    )
+    # Not all schools use the same course code format, so this figures out how long
+    # it is, if it exists; it will not exist for Not Transferrable.
+    try:
+        course_id_delim = 1 + list(
+            bool(re.search(r"\d", s)) for s in course_info
+        ).index(True)
+    except ValueError:
+        course_id_delim = 1
+    # Same deal with credit counts.
+    try:
+        cr_delim = (
+            len(course_info)
+            - 1
+            - list(bool(re.search(r"\(", s)) for s in course_info[::-1]).index(True)
+        )
+    except ValueError:
+        cr_delim = len(course_info)
+    # note serves as a credit count override, since the RPI-side credit counts
+    # are inaccurate
+    out = {
+        "id": " ".join(course_info[:course_id_delim]),
+        "name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
+        "catalog": td.find_element(By.TAG_NAME, "span").text,
+    }
+    if note is None:
+        out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1])})
+        return out
+    else:
+        out.update({"note": note})
+        return out
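
A worked example of the delimiter logic on an invented course row (real input is the cell's innerHTML, cut at the first <br>): the course ID runs through the first token containing a digit, and the credit count is the last parenthesized token.

    import re

    course_info = "CSCI 1100 Computer Science I (4)".split()  # invented sample
    course_id_delim = 1 + [bool(re.search(r"\d", s)) for s in course_info].index(True)
    cr_delim = (
        len(course_info)
        - 1
        - [bool(re.search(r"\(", s)) for s in course_info[::-1]].index(True)
    )
    assert " ".join(course_info[:course_id_delim]) == "CSCI 1100"
    assert " ".join(course_info[course_id_delim:cr_delim]) == "Computer Science I"
    assert " ".join(course_info[cr_delim:])[1:-1] == "4"
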
+
+
 def main():
     global driver
-    if len(sys.argv) != 3 and len(sys.argv) != 4:
-        print(
-            f"USAGE: python {sys.argv[0]} <transfer file> <state file> [timeout minutes]"
-        )
-        exit(1)
-    transfer_json_path = sys.argv[1]
-    state_json_path = sys.argv[2]
-    timeout_seconds = int(sys.argv[3] if len(sys.argv) == 4 else 120) * 60
-    # Set up timeout so that the GH action does not run forever, pretend it's ^C
-    print(f"Setting timeout to {timeout_seconds} seconds", file=sys.stderr)
-    signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
-    alarm(timeout_seconds)
-    print(f"Setting up selenium Firefox emulator")
+    if len(sys.argv) != 3:
+        print(f"USAGE: python {sys.argv[0]} <page number to scrape> <output file>")
+        return 1
+    PAGE_NUM_TO_SCRAPE = int(sys.argv[1])
+    OUT_FILENAME = sys.argv[2]
     options = webdriver.FirefoxOptions()
     options.add_argument("--headless")
     user_agent = UserAgent().random
     options.set_preference("general.useragent.override", user_agent)
-    # options.set_preference("network.proxy.socks", "")
-    # options.set_preference("network.proxy.socks_port", )
-    # options.set_preference("network.proxy.socks_remote_dns", True)
-    # options.set_preference("network.proxy.type", 1)
     print(f"Using randomized user agent {user_agent}", file=sys.stderr)
     driver = webdriver.Firefox(options=options)
-    print(f"Connecting to the TES Public View")
     driver.get(
         "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
     )
-    print(
-        f'Title is {driver.find_element(By.TAG_NAME, "title").get_attribute("innerText").strip()}',
-        file=sys.stderr,
-    )
-    num_pages = int(
-        driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
-    )
-    print(f"{num_pages} pages detected", file=sys.stderr)
-    state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
-    institutions = {}
-    if os.path.isfile(state_json_path):
-        with open(state_json_path, "r") as statejson:
-            state = json.load(statejson)
-    if os.path.isfile(transfer_json_path):
-        with open(transfer_json_path, "r") as transferjson:
-            institutions = json.load(transferjson)
-    print("Loaded state: ", end="", file=sys.stderr)
-    json.dump(state, sys.stderr, indent=4)
-    print("", file=sys.stderr)
-    if state["inst_pg"] > num_pages:
-        raise Exception
-    try:
-        curr_inst_page = 1
-        while state["inst_pg"] <= num_pages:
-            curr_inst_page, page = jump_to_page(
-                curr_inst_page,
-                state["inst_pg"],
-                num_pages,
-                "gdvInstWithEQ",
-                "lblInstWithEQPaginationInfo",
-            )
-            inst_list_len = len(
-                page.find_elements(
-                    By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
-                )
-            )
-            while state["inst_idx"] < inst_list_len:
-                institution_link = driver.find_element(
-                    "id", "gdvInstWithEQ"
-                ).find_elements(
-                    By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
-                )[
-                    state["inst_idx"]
-                ]
-                fields = institution_link.find_element(By.XPATH, "../..").find_elements(
-                    By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
-                )
-                inst_name = normalize_title(institution_link.text)
-                city = normalize_title(fields[0].text)
-                us_state = fields[1].text.strip()
-                institution_link.click()
-                wait(EC.staleness_of(institution_link))
-                try:
-                    course_pages_len = int(
-                        driver.find_element(
-                            "id", "lblCourseEQPaginationInfo"
-                        ).text.split()[-1]
-                    )
-                except NoSuchElementException:
-                    course_pages_len = 1
-                try:
-                    courses = institutions[inst_name]["courses"]
-                except Exception:
-                    courses = []
-                curr_course_page = 1
-                while state["course_pg"] <= course_pages_len:
-                    curr_course_page, page = jump_to_page(
-                        curr_course_page,
-                        state["course_pg"],
-                        course_pages_len,
-                        "gdvCourseEQ",
-                        "lblCourseEQPaginationInfo",
-                    )
-                    course_links_len = len(
-                        page.find_elements(
-                            By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
-                        )
-                    )
-                    while state["course_idx"] < course_links_len:
-                        course_link = driver.find_element(
-                            "id", "gdvCourseEQ"
-                        ).find_elements(
-                            By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
-                        )[
-                            state["course_idx"]
-                        ]
-                        course_link.click()
-                        try:
-                            wait(
-                                EC.element_to_be_clickable(
-                                    (By.CSS_SELECTOR, ".modal-header button")
-                                ),
-                            )
-                            transfer = [
-                                scrape_course_card("lblSendCourseEQDetail", i, False)
-                                for i in range(
-                                    0,
-                                    len(
-                                        driver.find_element(
-                                            "id", "lblSendCourseEQDetail"
-                                        ).find_elements(
-                                            By.CSS_SELECTOR, ".course-detail"
-                                        )
-                                    ),
-                                )
-                            ]
-                            rpi = [
-                                scrape_course_card("lblReceiveCourseEQDetail", i, True)
-                                for i in range(
-                                    0,
-                                    len(
-                                        driver.find_element(
-                                            "id", "lblReceiveCourseEQDetail"
-                                        ).find_elements(
-                                            By.CSS_SELECTOR, ".course-detail"
-                                        )
-                                    ),
-                                )
-                            ]
-                            print(
-                                f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
-                                file=sys.stderr,
-                            )
-                            begin_date = driver.find_element(
-                                "id", "lblBeginEffectiveDate"
-                            ).text
-                            end_date = driver.find_element(
-                                "id", "lblEndEffectiveDate"
-                            ).text
-                            driver.find_element(
-                                By.CSS_SELECTOR, ".modal-header button"
-                            ).click()
-                            courses += [
-                                {
-                                    "transfer": transfer,
-                                    "rpi": rpi,
-                                    "begin": begin_date,
-                                    "end": end_date,
-                                }
-                            ]
-                            state["course_idx"] += 1
-                        except (Exception, KeyboardInterrupt) as e:
-                            institutions.update(
-                                {
-                                    inst_name: {
-                                        "city": city,
-                                        "state": us_state,
-                                        "courses": courses,
-                                    }
-                                }
-                            )
-                            raise e
-                    state["course_idx"] = 0
-                    state["course_pg"] += 1
-                institutions.update(
-                    {inst_name: {"city": city, "state": us_state, "courses": courses}}
-                )
-                state["course_pg"] = 1
-                state["inst_idx"] += 1
-                driver.find_element("id", "btnSwitchView").click()
-                wait(
-                    EC.text_to_be_present_in_element(
-                        ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
-                    ),
-                )
-            state["inst_idx"] = 0
-            state["inst_pg"] += 1
-    except (Exception, KeyboardInterrupt) as e:
-        print("Program hits exception and will save and terminate", file=sys.stderr)
-        print(traceback.format_exc(), file=sys.stderr)
-        print("Program will terminate with state: ", end="", file=sys.stderr)
-        json.dump(state, sys.stderr, indent=4)
-        print("", file=sys.stderr)
-    with open(transfer_json_path, "w") as transferjson:
-        json.dump(institutions, transferjson, indent=4)
-    with open(state_json_path, "w") as statejson:
-        json.dump(state, statejson, indent=4)
+    with open(OUT_FILENAME, "w") as transferjson:
+        json.dump(scrape_page(PAGE_NUM_TO_SCRAPE), transferjson, indent=4)
     driver.quit()
 
 
 if __name__ == "__main__":
-    main()
+    exit(main())
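
A note on the redesign: main now returns 1 on a usage error and None on success, and exit(main()) maps None to exit status 0, so each invocation scrapes exactly one page of institutions and reports its outcome through the exit code. That is what replaces the old single-process crawl with its timeout and resume state: independent pages can be scraped by separate processes. A sketch of such a wrapper, where the script name, page range, and output naming are assumptions for illustration:

    import subprocess
    import sys

    # Hypothetical wrapper: one scraper process per institution page.
    procs = {
        page: subprocess.Popen(
            [sys.executable, "scrape.py", str(page), f"transfer_{page}.json"]
        )
        for page in range(1, 4)
    }
    failed = [page for page, proc in procs.items() if proc.wait() != 0]
    if failed:
        print(f"Pages failed: {failed}", file=sys.stderr)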