mirror of
https://github.com/quatalog/quatalog.git
synced 2024-11-08 08:04:24 +00:00
Add if __name__ == "__main__" and fix workflow
This commit is contained in:
parent
15b09123ee
commit
f216c45748
7
.github/workflows/transfer.yml
vendored
7
.github/workflows/transfer.yml
vendored
|
@ -29,14 +29,17 @@ jobs:
|
||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
working-directory: quatalog-scraping/transfer_scraper
|
working-directory: quatalog-scraping/transfer_scraper
|
||||||
run: pip install -r 'requirements.txt'
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r 'requirements.txt'
|
||||||
|
|
||||||
- name: Log IP
|
- name: Log IP
|
||||||
run: |
|
run: |
|
||||||
echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
|
echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
|
||||||
|
|
||||||
- name: Scrape transfer guide
|
- name: Scrape transfer guide
|
||||||
run: python3 quatalog-scraping/transfer_scraper data/transfer.json data/transfer_state.json
|
run: |
|
||||||
|
python3 quatalog-scraping/transfer_scraper/main.py data/transfer.json data/transfer_state.json
|
||||||
|
|
||||||
- name: Upload data to artifact
|
- name: Upload data to artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
|
|
|
@ -100,222 +100,235 @@ def scrape_course_card(html_id, i, note):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if len(sys.argv) != 3:
|
def main():
|
||||||
print(f"USAGE: python {sys.argv[0]} <transfer file> <state file>")
|
if len(sys.argv) != 3:
|
||||||
exit(1)
|
print(f"USAGE: python {sys.argv[0]} <transfer file> <state file>")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
transfer_json_path = sys.argv[1]
|
transfer_json_path = sys.argv[1]
|
||||||
state_json_path = sys.argv[2]
|
state_json_path = sys.argv[2]
|
||||||
|
|
||||||
options = webdriver.FirefoxOptions()
|
options = webdriver.FirefoxOptions()
|
||||||
user_agent = UserAgent().random
|
user_agent = UserAgent().random
|
||||||
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
|
print(f"Using randomized user agent {user_agent}", file=sys.stderr)
|
||||||
if sys.argv[-1] != "gui":
|
if sys.argv[-1] != "gui":
|
||||||
options.add_argument("--headless")
|
options.add_argument("--headless")
|
||||||
options.set_preference("general.useragent.override", user_agent)
|
options.set_preference("general.useragent.override", user_agent)
|
||||||
driver = webdriver.Firefox(options=options)
|
driver = webdriver.Firefox(options=options)
|
||||||
driver.get(
|
driver.get(
|
||||||
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
|
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
|
||||||
)
|
)
|
||||||
|
|
||||||
num_pages = int(
|
num_pages = int(
|
||||||
driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
|
driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
|
||||||
)
|
)
|
||||||
print(f"{num_pages} pages detected", file=sys.stderr)
|
print(f"{num_pages} pages detected", file=sys.stderr)
|
||||||
|
|
||||||
state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
|
state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
|
||||||
institutions = {}
|
institutions = {}
|
||||||
if os.path.isfile(state_json_path):
|
if os.path.isfile(state_json_path):
|
||||||
with open(state_json_path, "r") as statejson:
|
with open(state_json_path, "r") as statejson:
|
||||||
state = json.load(statejson)
|
state = json.load(statejson)
|
||||||
if os.path.isfile(transfer_json_path):
|
if os.path.isfile(transfer_json_path):
|
||||||
with open(transfer_json_path, "r") as transferjson:
|
with open(transfer_json_path, "r") as transferjson:
|
||||||
institutions = json.load(transferjson)
|
institutions = json.load(transferjson)
|
||||||
|
|
||||||
print("Loaded state: ", end="", file=sys.stderr)
|
print("Loaded state: ", end="", file=sys.stderr)
|
||||||
json.dump(state, sys.stderr, indent=4)
|
json.dump(state, sys.stderr, indent=4)
|
||||||
print("", file=sys.stderr)
|
print("", file=sys.stderr)
|
||||||
|
|
||||||
|
# Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C
|
||||||
|
signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
|
||||||
|
alarm(60 * 60 * 2)
|
||||||
|
|
||||||
# Set up 2hr timeout so that the GH action does not run forever, pretend it's ^C
|
try:
|
||||||
signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
|
curr_page = 1
|
||||||
alarm(60 * 60 * 2)
|
while state["inst_pg"] <= num_pages:
|
||||||
|
page = driver.find_element("id", f"gdvInstWithEQ")
|
||||||
|
|
||||||
|
if state["inst_pg"] != 1:
|
||||||
try:
|
while curr_page != state["inst_pg"]:
|
||||||
curr_page = 1
|
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
|
||||||
while state["inst_pg"] <= num_pages:
|
jumpable_pages = {
|
||||||
page = driver.find_element("id", f"gdvInstWithEQ")
|
int(x.get_attribute("href").split("'")[3][5:]): x
|
||||||
|
for x in driver.find_elements(
|
||||||
if state["inst_pg"] != 1:
|
By.CSS_SELECTOR,
|
||||||
while curr_page != state["inst_pg"]:
|
"""a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
|
||||||
print(f"Jumping to institution page {curr_page}", file=sys.stderr)
|
|
||||||
jumpable_pages = {
|
|
||||||
int(x.get_attribute("href").split("'")[3][5:]): x
|
|
||||||
for x in driver.find_elements(
|
|
||||||
By.CSS_SELECTOR,
|
|
||||||
"""a[href^="javascript:__doPostBack('gdvInstWithEQ','Page$"]""",
|
|
||||||
)
|
|
||||||
}
|
|
||||||
curr_page = int(
|
|
||||||
driver.find_element(
|
|
||||||
"id", "lblInstWithEQPaginationInfo"
|
|
||||||
).text.split()[-3]
|
|
||||||
)
|
|
||||||
if state["inst_pg"] in jumpable_pages:
|
|
||||||
jumpable_pages[state["inst_pg"]].click()
|
|
||||||
curr_page = state["inst_pg"]
|
|
||||||
elif state["inst_pg"] < min(jumpable_pages):
|
|
||||||
jumpable_pages[min(jumpable_pages)].click()
|
|
||||||
curr_page = min(jumpable_pages)
|
|
||||||
else:
|
|
||||||
jumpable_pages[max(jumpable_pages)].click()
|
|
||||||
curr_page = max(jumpable_pages)
|
|
||||||
|
|
||||||
wait(EC.staleness_of(page))
|
|
||||||
sleep(random.uniform(3, 6))
|
|
||||||
page = driver.find_element("id", f"gdvInstWithEQ")
|
|
||||||
|
|
||||||
inst_list_len = len(
|
|
||||||
page.find_elements(
|
|
||||||
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
while state["inst_idx"] < inst_list_len:
|
|
||||||
institution_link = driver.find_element("id", "gdvInstWithEQ").find_elements(
|
|
||||||
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
|
|
||||||
)[state["inst_idx"]]
|
|
||||||
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
|
|
||||||
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
|
|
||||||
)
|
|
||||||
inst_name = institution_link.text.title().strip()
|
|
||||||
city = fields[0].text.title().strip()
|
|
||||||
us_state = fields[1].text.strip()
|
|
||||||
|
|
||||||
institution_link.click()
|
|
||||||
wait(EC.staleness_of(institution_link))
|
|
||||||
|
|
||||||
try:
|
|
||||||
course_pages_len = int(
|
|
||||||
driver.find_element(
|
|
||||||
"id", "lblInstWithEQPaginationInfo"
|
|
||||||
).text.split()[-1]
|
|
||||||
)
|
|
||||||
except NoSuchElementException:
|
|
||||||
course_pages_len = 1
|
|
||||||
|
|
||||||
try:
|
|
||||||
courses = institutions[inst_name]["courses"]
|
|
||||||
except:
|
|
||||||
courses = []
|
|
||||||
|
|
||||||
while state["course_pg"] <= course_pages_len:
|
|
||||||
course_links_len = len(
|
|
||||||
driver.find_element("id", "gdvCourseEQ").find_elements(
|
|
||||||
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
while state["course_idx"] < course_links_len:
|
|
||||||
course_link = driver.find_element(
|
|
||||||
"id", "gdvCourseEQ"
|
|
||||||
).find_elements(
|
|
||||||
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
|
|
||||||
)[
|
|
||||||
state["course_idx"]
|
|
||||||
]
|
|
||||||
course_link.click()
|
|
||||||
|
|
||||||
try:
|
|
||||||
wait(
|
|
||||||
EC.element_to_be_clickable(
|
|
||||||
(By.CSS_SELECTOR, ".modal-header button")
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
}
|
||||||
transfer = [
|
curr_page = int(
|
||||||
scrape_course_card("lblSendCourseEQDetail", i, False)
|
|
||||||
for i in range(
|
|
||||||
0,
|
|
||||||
len(
|
|
||||||
driver.find_element(
|
|
||||||
"id", "lblSendCourseEQDetail"
|
|
||||||
).find_elements(By.CSS_SELECTOR, ".course-detail")
|
|
||||||
),
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
rpi = [
|
|
||||||
scrape_course_card("lblReceiveCourseEQDetail", i, True)
|
|
||||||
for i in range(
|
|
||||||
0,
|
|
||||||
len(
|
|
||||||
driver.find_element(
|
|
||||||
"id", "lblReceiveCourseEQDetail"
|
|
||||||
).find_elements(By.CSS_SELECTOR, ".course-detail")
|
|
||||||
),
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
|
|
||||||
begin_date = driver.find_element(
|
|
||||||
"id", "lblBeginEffectiveDate"
|
|
||||||
).text
|
|
||||||
end_date = driver.find_element("id", "lblEndEffectiveDate").text
|
|
||||||
|
|
||||||
driver.find_element(
|
driver.find_element(
|
||||||
By.CSS_SELECTOR, ".modal-header button"
|
"id", "lblInstWithEQPaginationInfo"
|
||||||
).click()
|
).text.split()[-3]
|
||||||
|
)
|
||||||
|
if state["inst_pg"] in jumpable_pages:
|
||||||
|
jumpable_pages[state["inst_pg"]].click()
|
||||||
|
curr_page = state["inst_pg"]
|
||||||
|
elif state["inst_pg"] < min(jumpable_pages):
|
||||||
|
jumpable_pages[min(jumpable_pages)].click()
|
||||||
|
curr_page = min(jumpable_pages)
|
||||||
|
else:
|
||||||
|
jumpable_pages[max(jumpable_pages)].click()
|
||||||
|
curr_page = max(jumpable_pages)
|
||||||
|
|
||||||
courses += [
|
wait(EC.staleness_of(page))
|
||||||
{
|
sleep(random.uniform(3, 6))
|
||||||
"transfer": transfer,
|
page = driver.find_element("id", f"gdvInstWithEQ")
|
||||||
"rpi": rpi,
|
|
||||||
"begin": begin_date,
|
|
||||||
"end": end_date,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
state["course_idx"] += 1
|
|
||||||
except Exception as e:
|
|
||||||
institutions.update(
|
|
||||||
{
|
|
||||||
inst_name: {
|
|
||||||
"city": city,
|
|
||||||
"state": us_state,
|
|
||||||
"courses": courses,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
raise e
|
|
||||||
state["course_idx"] = 0
|
|
||||||
state["course_pg"] += 1
|
|
||||||
institutions.update(
|
|
||||||
{inst_name: {"city": city, "state": us_state, "courses": courses}}
|
|
||||||
)
|
|
||||||
state["course_pg"] = 1
|
|
||||||
state["inst_idx"] += 1
|
|
||||||
|
|
||||||
driver.find_element("id", "btnSwitchView").click()
|
inst_list_len = len(
|
||||||
wait(
|
page.find_elements(
|
||||||
EC.text_to_be_present_in_element(
|
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
|
||||||
("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
state["inst_idx"] = 0
|
while state["inst_idx"] < inst_list_len:
|
||||||
state["inst_pg"] = (state["inst_pg"] % num_pages) + 1
|
institution_link = driver.find_element(
|
||||||
|
"id", "gdvInstWithEQ"
|
||||||
|
).find_elements(
|
||||||
|
By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
|
||||||
|
)[
|
||||||
|
state["inst_idx"]
|
||||||
|
]
|
||||||
|
fields = institution_link.find_element(By.XPATH, "../..").find_elements(
|
||||||
|
By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
|
||||||
|
)
|
||||||
|
inst_name = institution_link.text.title().strip()
|
||||||
|
city = fields[0].text.title().strip()
|
||||||
|
us_state = fields[1].text.strip()
|
||||||
|
|
||||||
except (Exception, KeyboardInterrupt) as e:
|
institution_link.click()
|
||||||
print("Program hits exception and will save and terminate", file=sys.stderr)
|
wait(EC.staleness_of(institution_link))
|
||||||
print(traceback.format_exc(), file=sys.stderr)
|
|
||||||
|
|
||||||
print("Program will terminate with state: ", end="", file=sys.stderr)
|
try:
|
||||||
json.dump(state, sys.stderr, indent=4)
|
course_pages_len = int(
|
||||||
print("", file=sys.stderr)
|
driver.find_element(
|
||||||
with open(transfer_json_path, "w") as transferjson:
|
"id", "lblInstWithEQPaginationInfo"
|
||||||
json.dump(institutions, transferjson, indent=4)
|
).text.split()[-1]
|
||||||
with open(state_json_path, "w") as statejson:
|
)
|
||||||
json.dump(state, statejson, indent=4)
|
except NoSuchElementException:
|
||||||
driver.quit()
|
course_pages_len = 1
|
||||||
|
|
||||||
|
try:
|
||||||
|
courses = institutions[inst_name]["courses"]
|
||||||
|
except:
|
||||||
|
courses = []
|
||||||
|
|
||||||
|
while state["course_pg"] <= course_pages_len:
|
||||||
|
course_links_len = len(
|
||||||
|
driver.find_element("id", "gdvCourseEQ").find_elements(
|
||||||
|
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
while state["course_idx"] < course_links_len:
|
||||||
|
course_link = driver.find_element(
|
||||||
|
"id", "gdvCourseEQ"
|
||||||
|
).find_elements(
|
||||||
|
By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
|
||||||
|
)[
|
||||||
|
state["course_idx"]
|
||||||
|
]
|
||||||
|
course_link.click()
|
||||||
|
|
||||||
|
try:
|
||||||
|
wait(
|
||||||
|
EC.element_to_be_clickable(
|
||||||
|
(By.CSS_SELECTOR, ".modal-header button")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
transfer = [
|
||||||
|
scrape_course_card("lblSendCourseEQDetail", i, False)
|
||||||
|
for i in range(
|
||||||
|
0,
|
||||||
|
len(
|
||||||
|
driver.find_element(
|
||||||
|
"id", "lblSendCourseEQDetail"
|
||||||
|
).find_elements(
|
||||||
|
By.CSS_SELECTOR, ".course-detail"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
rpi = [
|
||||||
|
scrape_course_card("lblReceiveCourseEQDetail", i, True)
|
||||||
|
for i in range(
|
||||||
|
0,
|
||||||
|
len(
|
||||||
|
driver.find_element(
|
||||||
|
"id", "lblReceiveCourseEQDetail"
|
||||||
|
).find_elements(
|
||||||
|
By.CSS_SELECTOR, ".course-detail"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
|
||||||
|
begin_date = driver.find_element(
|
||||||
|
"id", "lblBeginEffectiveDate"
|
||||||
|
).text
|
||||||
|
end_date = driver.find_element(
|
||||||
|
"id", "lblEndEffectiveDate"
|
||||||
|
).text
|
||||||
|
|
||||||
|
driver.find_element(
|
||||||
|
By.CSS_SELECTOR, ".modal-header button"
|
||||||
|
).click()
|
||||||
|
|
||||||
|
courses += [
|
||||||
|
{
|
||||||
|
"transfer": transfer,
|
||||||
|
"rpi": rpi,
|
||||||
|
"begin": begin_date,
|
||||||
|
"end": end_date,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
state["course_idx"] += 1
|
||||||
|
except Exception as e:
|
||||||
|
institutions.update(
|
||||||
|
{
|
||||||
|
inst_name: {
|
||||||
|
"city": city,
|
||||||
|
"state": us_state,
|
||||||
|
"courses": courses,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
raise e
|
||||||
|
state["course_idx"] = 0
|
||||||
|
state["course_pg"] += 1
|
||||||
|
institutions.update(
|
||||||
|
{inst_name: {"city": city, "state": us_state, "courses": courses}}
|
||||||
|
)
|
||||||
|
state["course_pg"] = 1
|
||||||
|
state["inst_idx"] += 1
|
||||||
|
|
||||||
|
driver.find_element("id", "btnSwitchView").click()
|
||||||
|
wait(
|
||||||
|
EC.text_to_be_present_in_element(
|
||||||
|
("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
state["inst_idx"] = 0
|
||||||
|
state["inst_pg"] = (state["inst_pg"] % num_pages) + 1
|
||||||
|
|
||||||
|
except (Exception, KeyboardInterrupt) as e:
|
||||||
|
print("Program hits exception and will save and terminate", file=sys.stderr)
|
||||||
|
print(traceback.format_exc(), file=sys.stderr)
|
||||||
|
|
||||||
|
print("Program will terminate with state: ", end="", file=sys.stderr)
|
||||||
|
json.dump(state, sys.stderr, indent=4)
|
||||||
|
print("", file=sys.stderr)
|
||||||
|
with open(transfer_json_path, "w") as transferjson:
|
||||||
|
json.dump(institutions, transferjson, indent=4)
|
||||||
|
with open(state_json_path, "w") as statejson:
|
||||||
|
json.dump(state, statejson, indent=4)
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
Loading…
Reference in a new issue