import json
import html
import sys
import re
import random
from time import sleep

from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    StaleElementReferenceException,
    NoSuchElementException,
)


# Fix course titles accounting for Roman numerals up to X
def normalize_title(raw):
    s = " ".join(raw.split())
    s = re.sub(r"[A-Za-z]+(['‘’][A-Za-z]+)?", lambda m: m.group(0).capitalize(), s)
    s = re.sub(r"\b(Viii|Vii|Vi|Iv|Ix|Iii|Ii)\b", lambda a: a.group(0).upper(), s)
    return s.strip()
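
# Illustrative example (not from the original source): every word is
# capitalized, then multi-letter Roman numerals are restored to uppercase, so
#   normalize_title("INTRODUCTION TO  ALGORITHMS II") -> "Introduction To Algorithms II"
# Single-letter numerals (I, V, X) already come out right from .capitalize()
# alone, which is why the second regex only lists the two-letter-and-up forms.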


# Wait until the expected condition `ec` holds (up to 35 s), then sleep a
# random 0.4-1.9 s so requests are not perfectly regular.
def wait(ec):
    global driver

    WebDriverWait(
        driver, 35, ignored_exceptions=[StaleElementReferenceException]
    ).until(ec)
    sleep(random.uniform(400, 1900) / 1000)


# jump_to_page: navigates to a paginated page on this insufferable website
#
# curr_page: the current page number
# to_page: the page number to jump to
# postback_type: javascript:__doPostBack('<this field>','Page$3')
# pagination_type: <span id="<this field>">PAGE 1 OF 27<br></span>
#
# Returns (page number actually reached, the element with id postback_type).
# The total number of pages is read from the pagination label; a table with a
# single page has no such label, in which case we return page 1 immediately.
def jump_to_page(curr_page, to_page, postback_type, pagination_type):
    global driver

    wait(EC.visibility_of_element_located((By.ID, postback_type)))
    page = driver.find_element(By.ID, postback_type)
    try:
        num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
    except NoSuchElementException:
        return 1, page

    if to_page > num_pages or to_page < 1:
        raise ValueError(
            f"to_page was out of range ({to_page} not in [1, {num_pages}])"
        )
    while curr_page != to_page:
        # Map each page number linked from the current view to its <a> element.
        jumpable_pages = {
            int(x.get_attribute("href").split("'")[3][5:]): x
            for x in driver.find_elements(
                By.CSS_SELECTOR,
                """a[href^="javascript:__doPostBack('"""
                + postback_type
                + """','Page$"]""",
            )
        }
        curr_page = int(driver.find_element(By.ID, pagination_type).text.split()[-3])
        if to_page in jumpable_pages:
            jumpable_pages[to_page].click()
            curr_page = to_page
        elif to_page < min(jumpable_pages):
            # Target is left of the linked window: jump as far left as we can.
            jumpable_pages[min(jumpable_pages)].click()
            curr_page = min(jumpable_pages)
        else:
            # Target is right of the linked window: jump as far right as we can.
            jumpable_pages[max(jumpable_pages)].click()
            curr_page = max(jumpable_pages)

        wait(EC.staleness_of(page))
        sleep(random.uniform(400, 1900) / 1000)
        page = driver.find_element(By.ID, postback_type)
    return curr_page, page
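
# How the href parsing above works (illustrative values): a pager link's href
# looks like javascript:__doPostBack('gdvInstWithEQ','Page$3'). Splitting on
# "'" puts 'Page$3' at index 3, and [5:] strips the "Page$" prefix, leaving
# the page number. Only a sliding window of page links is shown at once, which
# is why the loop keeps jumping to min/max until to_page becomes clickable.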


# scrape_page: Scrapes a page of institutions
#
# page_num: The page to scrape.
# Note that the current page before running this function must be 1.
def scrape_page(page_num):
    global driver
    global options

    for i in range(1, 4):
        try:
            driver = webdriver.Firefox(options=options)
            driver.get("https://ipinfo.io/ip")
            print(f"Trying with IP {driver.page_source}", file=sys.stderr)
            driver.get(
                "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
            )
            wait(EC.visibility_of_element_located((By.TAG_NAME, "body")))
            print(f'Title: "{driver.title}"', file=sys.stderr)
            jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
            break
        except Exception as e:
            driver.quit()
            print(
                f"Attempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
                file=sys.stderr,
            )
            sleep(25)
    else:
        # for/else: reached only if every attempt above failed to break out.
        raise Exception("Failed to load the main page after 3 attempts, aborting.")

    num_institutions = len(
        driver.find_elements(
            By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
        )
    )
    driver.quit()

    print(f"Scraping page {page_num}, found {num_institutions} links", file=sys.stderr)
    return [scrape_institution_safe(i, page_num) for i in range(num_institutions)]


# scrape_institution_safe: retry wrapper around scrape_institution, since any
# single attempt can be killed by a flaky page load.
def scrape_institution_safe(index, page_num):
    for i in range(1, 4):
        try:
            return scrape_institution(index, page_num)
        except Exception as e:
            driver.quit()
            print(
                f"\tAttempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
                file=sys.stderr,
            )
            sleep(25)
    raise Exception(f"Failed to scrape {index} after 3 attempts, aborting.")


# scrape_institution: Scrapes an institution by index.
#
# index: the 0-indexed position of the institution on the page we are on.
def scrape_institution(index, page_num):
    global driver
    global options

    driver = webdriver.Firefox(options=options)
    driver.get(
        "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
    )
    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")

    # The institution's name, city, and state live in sibling <td>s of the link.
    inst_link = driver.find_element(
        By.ID, f"gdvInstWithEQ_btnCreditFromInstName_{index}"
    )
    [inst_name, inst_city, inst_state, _] = [
        e.text
        for e in inst_link.find_element(By.XPATH, "../..").find_elements(
            By.TAG_NAME, "td"
        )
    ]
    inst_name, inst_city = normalize_title(inst_name), normalize_title(inst_city)
    inst_link.click()
    wait(EC.staleness_of(inst_link))
    print(f"Scraping {inst_name} ({inst_city}, {inst_state})", file=sys.stderr)

    # Select all courses on every page of the course table
    try:
        num_pages = int(
            driver.find_element(By.ID, "lblCourseEQPaginationInfo").text.split()[-1]
        )
    except NoSuchElementException:
        num_pages = 1

    try:
        for i in range(1, num_pages + 1):
            jump_to_page(max(1, i - 1), i, "gdvCourseEQ", "lblCourseEQPaginationInfo")
            driver.find_element(By.ID, "gdvCourseEQ_cbxHeaderCheckAll").click()
    except NoSuchElementException:
        # Institution has no data
        return {
            "institution": inst_name,
            "city": inst_city,
            "state": inst_state,
            "courses": [],
        }

    # Open list
    driver.find_element(By.ID, "btnAddToMyEQList").click()
    wait(EC.visibility_of_element_located((By.ID, "gdvMyCourseEQList")))

    # Scrape list. Each row of the list is six <td>s: transfer course, RPI
    # course, note, begin date, end date, and a sixth cell we discard.
    # zip(*[iter(tds)] * 6) regroups the flat cell list into those rows.
    tds = driver.find_element(By.ID, "gdvMyCourseEQList").find_elements(
        By.TAG_NAME, "td"
    )

    transfer_courses = [
        {
            "transfer": parse_course_td(transfer_course, True),
            "rpi": parse_course_td(rpi_course, False),
            "note": note.text.strip(),
            "begin": begin.text.strip(),
            "end": end.text.strip(),
        }
        for transfer_course, rpi_course, note, begin, end, _ in zip(
            *[iter(tds)] * 6
        )
    ]

    driver.quit()

    return {
        "institution": inst_name,
        "city": inst_city,
        "state": inst_state,
        "courses": transfer_courses,
    }
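
# Shape of the record returned above (illustrative; the field values here are
# invented, the keys come from the code):
#   {
#       "institution": "Some Community College",
#       "city": "Some City",
#       "state": "NY",
#       "courses": [
#           {
#               "transfer": [{"id": "MATH 150", "name": "Calculus I", "credits": "4"}],
#               "rpi": [{"id": "MATH 1010", "name": "Calculus I"}],
#               "note": "...",
#               "begin": "...",
#               "end": "...",
#           },
#       ],
#   }
# Note the "rpi" entries carry no "credits" key, since parse_course_td is
# called with include_credits=False on that side.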


# Scrape course entries. We have a switch to disable including credit counts
# because the RPI-side credit counts are wrong most of the time and this is
# clarified in notes.
def parse_course_td(td, include_credits):
    td_text = html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")
    # This regex removes spaces next to brackets and parentheses. For example,
    # Calculus II ( 04) -> Calculus II (04)
    courses_info = [
        re.sub(r"(?<=[\[{(])\s+|\s+(?=[\]})])", "", x).split()
        for x in td_text[: len(td_text) - 3]
    ]

    return [parse_one_course(x, include_credits) for x in courses_info]
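
# Illustrative trace (hypothetical cell contents): if a td's innerHTML is
#   "MATH 150 Calculus I ( 4)<br>MATH 151 Calculus II ( 4)<br>a<br>b<br>c"
# then td_text has five <br>-separated fragments, the last three of which are
# non-course trailer dropped by the -3 slice, and courses_info becomes
#   [["MATH", "150", "Calculus", "I", "(4)"],
#    ["MATH", "151", "Calculus", "II", "(4)"]]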


def parse_one_course(course_info, include_credits):
    # Not all schools use the same course code format, so this figures out how
    # long the code is if it exists. It will not exist for "Not Transferrable"
    # entries and AP tests.
    try:
        course_id_delim = 1 + list(
            bool(re.search(r"\d", s)) for s in course_info
        ).index(True)
    except ValueError:
        course_id_delim = 1

    # Same deal with credit counts. Fancy logic here to avoid catching course
    # titles with parentheses in them which do not have a credit count; this
    # happened 3 times. This also ignores credit counts with "Variable" in
    # them, but ... you try.
    try:
        if course_info[-1] == "()":
            cr_delim = -1
        else:
            cr_delim = (
                len(course_info)
                - 1
                - list(
                    bool(re.search(r"^\([.]*[0-9]", s.strip()))
                    for s in course_info[::-1]
                ).index(True)
            )
            assert bool(re.search(r"[0-9]\)", course_info[-1]))
    except (ValueError, AssertionError):
        cr_delim = len(course_info)

    # The note field serves as a credit count override downstream, since the
    # RPI-side credit counts are inaccurate.
    out = {
        "id": " ".join(course_info[:course_id_delim]),
        "name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
    }
    if include_credits:
        out["credits"] = " ".join(course_info[cr_delim:])[1:-1].strip()
    return out
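
# Worked example (illustrative input): course_info = ["MATH", "150", "Calculus", "I", "(4)"]
#   - "150" is the first token containing a digit, so course_id_delim = 2
#     and id = "MATH 150"
#   - scanning from the right, "(4)" matches the credit-count pattern, so
#     cr_delim = 5 - 1 - 0 = 4 and name = normalize_title("Calculus I") = "Calculus I"
#   - with include_credits, credits = "(4)"[1:-1].strip() = "4"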


def main():
    global driver
    global options

    if len(sys.argv) != 3:
        print(
            f"USAGE: python {sys.argv[0]} <page number to scrape> <output file>",
            file=sys.stderr,
        )
        return 1

    PAGE_NUM_TO_SCRAPE = int(sys.argv[1])
    OUT_FILENAME = sys.argv[2]

    print("Setting up selenium Firefox emulator", file=sys.stderr)
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")

    # Randomize the user agent so repeated runs look less like one scraper.
    user_agent = UserAgent().random
    options.set_preference("general.useragent.override", user_agent)
    print(f"Using randomized user agent {user_agent}", file=sys.stderr)

    with open(OUT_FILENAME, "w") as transferjson:
        json.dump(scrape_page(PAGE_NUM_TO_SCRAPE), transferjson, indent=4)

    driver.quit()


if __name__ == "__main__":
    sys.exit(main())
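
# Example invocation (hypothetical filename; the script name is whatever this
# file is saved as):
#   python scrape_transfer.py 3 page3.json
# scrapes the institutions listed on page 3 of the public view and writes
# them as JSON to page3.json.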