Mirror of https://github.com/quatalog/quatalog.git (synced 2024-11-15 19:42:44 +00:00)

Compare commits: 976b553b14 ... a0b9081f8f (16 commits)
Commits:
a0b9081f8f
4f69c1d8a0
56c9268398
02b383b90b
95e8238786
fc72fda5de
e45318404d
10715c89e3
52fdab6ce6
42dbf3c19a
985f40c4e7
cb24d84b46
ce2f22b23b
c8eadc06ee
6ad6f85708
acdd08168f
Changed file: .github/workflows/transfer.yml (vendored), 110 changed lines
@@ -1,37 +1,32 @@
 name: Scrape transfer and update file
 run-name: Scrape transfer and update file
-env:
-  DEFAULT_TIMEOUT: 45
 on:
-  # schedule:
-  #   - cron: '*/15 * * * *'
-  repository_dispatch:
-    types: transfer-scraper
   workflow_dispatch:
-    inputs:
-      timeout:
-        description: "Timeout time"
-        required: true
-        type: number
-        default: 2
-concurrency:
-  group: transfer-scraper
 
 jobs:
-  scrape-data:
-    name: Scrape transfer guide
+  setup:
+    name: Get number of pages and set up scrape page jobs
     runs-on: ubuntu-latest
+    steps:
+      - name: Create matrix parameters
+        id: matrix-params
+        run: |
+          NUM_PAGES="$(curl -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0' 'https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce' | grep -e 'lblInstWithEQPaginationInfo' | grep -Poie '(?<=of )[0-9]*')"
+          MATRIX_PARAMS="$(seq -s "," 1 "$NUM_PAGES")"
+          MATRIX_PARAMS="\"page\": $(sed -e 's/,/}, {"page": /g' <<< "$MATRIX_PARAMS")"
+          echo "matrix-params={\"include\": [{"$MATRIX_PARAMS"}]}" | tee $GITHUB_OUTPUT
+    outputs:
+      matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
+  scrape-page:
+    name: Scrape page
+    runs-on: ubuntu-latest
+    needs: setup
+    strategy:
+      matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
+      fail-fast: false
     steps:
       - name: Checkout scraping repo
         uses: actions/checkout@v4
-        with:
-          path: quatalog-scraping
 
-      - name: Checkout data repo
-        uses: actions/checkout@v4
-        with:
-          repository: quatalog/data
-          path: data
-
       - name: Set up python
         uses: actions/setup-python@v5
@@ -40,74 +35,19 @@ jobs:
           cache: 'pip'
 
       - name: Install dependencies
-        working-directory: quatalog-scraping/transfer_scraper
+        working-directory: transfer_scraper
         run: |
           python -m pip install --upgrade pip
           pip install -r 'requirements.txt'
 
-      - name: Log IP
+      - name: Run scraper
+        working-directory: transfer_scraper
         run: |
-          echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
+          python3 main.py ${{ matrix.page }} transfer_${{ matrix.page }}.json
 
-      - name: Copy data to temp dir
-        run: |
-          mkdir new-data
-          dd status=progress if='data/transfer.json' of='new-data/transfer.json'
-          dd status=progress if='data/transfer_state.json' of='new-data/transfer_state.json'
-
-      - name: Scrape transfer guide
-        run: |
-          python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout || env.DEFAULT_TIMEOUT }}
-
       - name: Upload data to artifact
         uses: actions/upload-artifact@v4
         with:
-          name: transfer-data
-          path: new-data/
-
-  push-new-data:
-    name: Push new data to data repo
-    runs-on: ubuntu-latest
-    needs: [scrape-data]
-    steps:
-      - name: Clone Quatalog data
-        uses: actions/checkout@v4
-        with:
-          repository: quatalog/data
-          path: quatalog-data
-          token: ${{ secrets.PUSH_TOKEN }}
-
-      - name: Download data from artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: transfer-data
-          path: data
-
-      - name: Copy data to repo directory
-        run: |
-          ls -lsa data
-          dd status=progress if='data/transfer.json' of='quatalog-data/transfer.json'
-          dd status=progress if='data/transfer_state.json' of='quatalog-data/transfer_state.json'
-
-      - name: Push new data
-        working-directory: quatalog-data
-        run: |
-          git config user.name "Quatalog Updater"
-          git config user.email "github_actions@quatalog.com"
-          git add transfer.json transfer_state.json
-          git commit -m "$(date)" || exit 0
-          git push
-
-  re-run-scraper:
-    name: Tell Github to run this workflow again
-    runs-on: ubuntu-latest
-    needs: [push-new-data]
-    steps:
-      - name: Tell Github to run this workflow again
-        run: |
-          curl -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: token ${{ secrets.PUSH_TOKEN }}" \
-            --request POST \
-            --data '{"event_type": "transfer-scraper"}' \
-            "https://api.github.com/repos/quatalog/quatalog/dispatches"
+          name: transfer-page-${{ matrix.page }}
+          path: transfer_scraper/transfer_${{ matrix.page }}.json
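
Note on the reworked workflow: the single long-running scrape-data job (and the push-new-data and re-run-scraper jobs that followed it) is replaced by a setup job that reads the total page count from the TES public view and emits a job matrix, plus a scrape-page job that runs once per page and uploads its own transfer-page-${{ matrix.page }} artifact. As an illustration only (assuming the site reported 3 pages; the real value comes from NUM_PAGES), the "Create matrix parameters" step would write

matrix-params={"include": [{"page": 1}, {"page": 2}, {"page": 3}]}

to $GITHUB_OUTPUT, which scrape-page then consumes via fromJson(needs.setup.outputs.matrix-params).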
transfer_scraper/main.py

@@ -3,22 +3,17 @@ import html
 import sys
 import re
 import os.path
-import traceback
 from time import sleep
 import random
-from signal import alarm, SIGALRM, signal
 from fake_useragent import UserAgent
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import StaleElementReferenceException
-from selenium.common.exceptions import TimeoutException
-from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import (
+    StaleElementReferenceException,
+    NoSuchElementException,
+)
 
 
-def raise_(ex):
-    raise ex
-
-
 # Fix course titles accounting for Roman numerals up to X
@@ -29,84 +24,36 @@ def normalize_title(input):
     return s.strip()
 
 
+# Waits until EC plus some random wait time
 def wait(ec):
     global driver
 
     WebDriverWait(
-        driver, 20, ignored_exceptions=[StaleElementReferenceException]
+        driver, 40, ignored_exceptions=[StaleElementReferenceException]
     ).until(ec)
     sleep(random.uniform(400, 1900) / 1000)
 
 
-def scrape_course_card(html_id, i, note):
+# jump_to_page: navigates to a paginated page on this insufferable website
+#
+# curr_page: the current page number
+# to_page: the page number to jump to
+# num_pages: the total number of pages
+# postback_type: javascript:__doPostBack('<this field>','Page$3')
+# pagination_type: <span id="<this field>">PAGE 1 OF 27<br></span>
+def jump_to_page(curr_page, to_page, postback_type, pagination_type):
     global driver
 
-    trs = (
-        driver.find_element("id", html_id)
-        .find_elements(By.CSS_SELECTOR, ".course-detail")[i]
-        .find_elements(By.TAG_NAME, "tr")
-    )
-    course_name_and_id = trs[0].text.split()
-
-    course_desc = ""
-    if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
-        course_desc = trs[1].text
-
-    course_department = normalize_title(
-        next((x for x in trs if x.text.strip().startswith("Department:")))
-        .find_elements(By.TAG_NAME, "td")[1]
-        .text
-    )
-    course_catalog = normalize_title(
-        next((x for x in trs if x.text.strip().startswith("Source catalog:")))
-        .find_elements(By.TAG_NAME, "td")[1]
-        .text
-    )
+    page = driver.find_element(By.ID, postback_type)
 
     try:
-        k = 1 + next(
-            i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
-        )
-        course_id = " ".join(course_name_and_id[0:k])
-        course_name = normalize_title(" ".join(course_name_and_id[k:]))
-    except StopIteration:  # Handling for Not Transferrable
-        course_id = course_name_and_id[0]
-        course_name = normalize_title(" ".join(course_name_and_id[1:]))
-
-    if not note:
-        try:
-            course_credits = (
-                next((x for x in trs if x.text.strip().startswith("Units:")))
-                .find_elements(By.TAG_NAME, "td")[1]
-                .text.strip()
-            )
-        except:
-            course_credits = ""
-
-        return {
-            "id": course_id,
-            "name": course_name,
-            "credits": course_credits,
-            "desc": course_desc,
-            "department": course_department,
-            "catalog": course_catalog,
-        }
-    else:
-        course_note = driver.find_element("id", "lblCommentsPublic").text.strip()
-        return {
-            "id": course_id,
-            "name": course_name,
-            "note": course_note,
-            "desc": course_desc,
-            "department": course_department,
-            "catalog": course_catalog,
-        }
-
-
-def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
-    page = driver.find_element("id", postback_type)
-    if num_pages == 1:
+        num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
+    except NoSuchElementException:
         return 1, page
 
+    if to_page > num_pages or to_page < 1:
+        raise ValueError(
+            f"to_page was out of range ({to_page} not in [1, {num_pages}])"
+        )
     while curr_page != to_page:
         jumpable_pages = {
             int(x.get_attribute("href").split("'")[3][5:]): x
@@ -117,7 +64,7 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
                 + """','Page$"]""",
             )
         }
-        curr_page = int(driver.find_element("id", pagination_type).text.split()[-3])
+        curr_page = int(driver.find_element(By.ID, pagination_type).text.split()[-3])
         if to_page in jumpable_pages:
             jumpable_pages[to_page].click()
             curr_page = to_page
@@ -127,251 +74,183 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
         else:
             jumpable_pages[max(jumpable_pages)].click()
             curr_page = max(jumpable_pages)
-        print(f"Jumping to {postback_type} page {curr_page}", file=sys.stderr)
 
         wait(EC.staleness_of(page))
         sleep(random.uniform(400, 1900) / 1000)
-    page = driver.find_element("id", postback_type)
+    page = driver.find_element(By.ID, postback_type)
     return curr_page, page
 
 
-def main():
+# scrape_page: Scrapes a page of institutions
+#
+# page_num: The page to scrape.
+# Note that the current page before running this function must be 1.
+def scrape_page(page_num):
     global driver
+    global options
 
-    if len(sys.argv) != 3 and len(sys.argv) != 4:
-        print(
-            f"USAGE: python {sys.argv[0]} <transfer file> <state file> [timeout minutes]"
-        )
-        exit(1)
-
-    transfer_json_path = sys.argv[1]
-    state_json_path = sys.argv[2]
-    timeout_seconds = int(sys.argv[3] if len(sys.argv) == 4 else 120) * 60
-
-    # Set up timeout so that the GH action does not run forever, pretend it's ^C
-    print(f"Setting timeout to {timeout_seconds} seconds", file=sys.stderr)
-    signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
-    alarm(timeout_seconds)
-
-    options = webdriver.FirefoxOptions()
-    options.add_argument("--headless")
-
-    user_agent = UserAgent().random
-    options.set_preference("general.useragent.override", user_agent)
-    # options.set_preference("network.proxy.socks", "")
-    # options.set_preference("network.proxy.socks_port", )
-    # options.set_preference("network.proxy.socks_remote_dns", True)
-    # options.set_preference("network.proxy.type", 1)
-    print(f"Using randomized user agent {user_agent}", file=sys.stderr)
-
     driver = webdriver.Firefox(options=options)
     driver.get(
         "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
     )
+    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
 
-    print(
-        f'Title is {driver.find_element(By.TAG_NAME, "title").get_attribute("innerText").strip()}',
-        file=sys.stderr,
+    num_institutions = len(
+        driver.find_elements(
+            By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
+        )
     )
+    driver.quit()
 
-    num_pages = int(
-        driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
+    print(f"Scraping page {page_num}, found {num_institutions} links", file=sys.stderr)
+    return [scrape_institution(i, page_num) for i in range(0, num_institutions)]
+
+
+# scrape_institution: Scrapes an institution by index.
+#
+# index: the 0-indexed index of the instituion to scrape on the page we are on.
+def scrape_institution(index, page_num):
+    global driver
+    global options
+
+    driver = webdriver.Firefox(options=options)
+    driver.get(
+        "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
     )
-    print(f"{num_pages} pages detected", file=sys.stderr)
+    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
 
-    state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
-    institutions = {}
-    if os.path.isfile(state_json_path):
-        with open(state_json_path, "r") as statejson:
-            state = json.load(statejson)
-    if os.path.isfile(transfer_json_path):
-        with open(transfer_json_path, "r") as transferjson:
-            institutions = json.load(transferjson)
+    inst_link = driver.find_element(
+        By.ID, f"gdvInstWithEQ_btnCreditFromInstName_{index}"
+    )
+    [inst_name, inst_city, inst_state, _] = [
+        e.text
+        for e in inst_link.find_element(By.XPATH, "../..").find_elements(
+            By.TAG_NAME, "td"
+        )
+    ]
+    inst_name, inst_city = normalize_title(inst_name), normalize_title(inst_city)
+    inst_link.click()
+    wait(EC.staleness_of(inst_link))
+    print(f"Scraping {inst_name} ({inst_city}, {inst_state})", file=sys.stderr)
 
-    print("Loaded state: ", end="", file=sys.stderr)
-    json.dump(state, sys.stderr, indent=4)
-    print("", file=sys.stderr)
-
-    if state["inst_pg"] > num_pages:
-        raise Exception
+    # Add all courses
+    try:
+        num_pages = int(
+            driver.find_element(By.ID, "lblCourseEQPaginationInfo").text.split()[-1]
+        )
+    except NoSuchElementException:
+        num_pages = 1
 
     try:
-        curr_inst_page = 1
-        while state["inst_pg"] <= num_pages:
-            curr_inst_page, page = jump_to_page(
-                curr_inst_page,
-                state["inst_pg"],
-                num_pages,
-                "gdvInstWithEQ",
-                "lblInstWithEQPaginationInfo",
-            )
+        for i in range(1, num_pages + 1):
+            jump_to_page(max(1, i - 1), i, "gdvCourseEQ", "lblCourseEQPaginationInfo")
+            driver.find_element(By.ID, "gdvCourseEQ_cbxHeaderCheckAll").click()
+    except NoSuchElementException:
+        # Institution has no data
+        return {
+            "institution": inst_name,
+            "city": inst_city,
+            "state": inst_state,
+            "courses": [],
+        }
 
-            inst_list_len = len(
-                page.find_elements(
-                    By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
-                )
-            )
+    # Open list
+    driver.find_element(By.ID, "btnAddToMyEQList").click()
+    wait(EC.visibility_of_element_located((By.ID, "gdvMyCourseEQList")))
 
-            while state["inst_idx"] < inst_list_len:
-                institution_link = driver.find_element(
-                    "id", "gdvInstWithEQ"
-                ).find_elements(
-                    By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
-                )[
-                    state["inst_idx"]
-                ]
-                fields = institution_link.find_element(By.XPATH, "../..").find_elements(
-                    By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
-                )
-                inst_name = normalize_title(institution_link.text)
-                city = normalize_title(fields[0].text)
-                us_state = fields[1].text.strip()
+    # Scrape list
+    tds = driver.find_element(By.ID, "gdvMyCourseEQList").find_elements(
+        By.TAG_NAME, "td"
+    )
 
-                institution_link.click()
-                wait(EC.staleness_of(institution_link))
+    transfer_courses = [
+        {
+            "transfer": parse_course_td(transfer_course),
+            "rpi": parse_course_td(rpi_course, note.text.strip()),
+            "begin": begin.text.strip(),
+            "end": end.text.strip(),
+        }
+        for transfer_course, rpi_course, note, begin, end, _ in zip(
+            *[iter(x for x in tds)] * 6
+        )
+    ]
 
-                try:
-                    course_pages_len = int(
-                        driver.find_element(
-                            "id", "lblCourseEQPaginationInfo"
-                        ).text.split()[-1]
-                    )
-                except NoSuchElementException:
-                    course_pages_len = 1
+    driver.quit()
 
-                try:
-                    courses = institutions[inst_name]["courses"]
-                except Exception:
-                    courses = []
+    return {
+        "institution": inst_name,
+        "city": inst_city,
+        "state": inst_state,
+        "courses": transfer_courses,
+    }
 
-                curr_course_page = 1
-                while state["course_pg"] <= course_pages_len:
-                    curr_course_page, page = jump_to_page(
-                        curr_course_page,
-                        state["course_pg"],
-                        course_pages_len,
-                        "gdvCourseEQ",
-                        "lblCourseEQPaginationInfo",
-                    )
 
-                    course_links_len = len(
-                        page.find_elements(
-                            By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
-                        )
-                    )
+def parse_course_td(td, note=None):
+    course_info = (
+        html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
+    )
 
-                    while state["course_idx"] < course_links_len:
-                        course_link = driver.find_element(
-                            "id", "gdvCourseEQ"
-                        ).find_elements(
-                            By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
-                        )[
-                            state["course_idx"]
-                        ]
-                        course_link.click()
+    # Not all schools use the same course code format, so this figures out how long
+    # it is if it exists, it will not exist for Not Transferrable.
+    try:
+        course_id_delim = 1 + list(
+            bool(re.search(r"\d", s)) for s in course_info
+        ).index(True)
+    except ValueError:
+        course_id_delim = 1
 
-                        try:
-                            wait(
-                                EC.element_to_be_clickable(
-                                    (By.CSS_SELECTOR, ".modal-header button")
-                                ),
-                            )
+    # Same deal with credit counts.
+    try:
+        cr_delim = (
+            len(course_info)
+            - 1
+            - list(bool(re.search(r"\(", s)) for s in course_info[::-1]).index(True)
+        )
+    except ValueError:
+        cr_delim = len(course_info)
 
-                            transfer = [
-                                scrape_course_card("lblSendCourseEQDetail", i, False)
-                                for i in range(
-                                    0,
-                                    len(
-                                        driver.find_element(
-                                            "id", "lblSendCourseEQDetail"
-                                        ).find_elements(
-                                            By.CSS_SELECTOR, ".course-detail"
-                                        )
-                                    ),
-                                )
-                            ]
+    # note serves as a credit count override, since the RPI-side credit counts
+    # are inaccurate
+    out = {
+        "id": " ".join(course_info[:course_id_delim]),
+        "name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
+        "catalog": td.find_element(By.TAG_NAME, "span").text,
+    }
+    if note is None:
+        out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1])}),
+        return out
+    else:
+        out.update({"note": note})
+        return out
 
-                            rpi = [
-                                scrape_course_card("lblReceiveCourseEQDetail", i, True)
-                                for i in range(
-                                    0,
-                                    len(
-                                        driver.find_element(
-                                            "id", "lblReceiveCourseEQDetail"
-                                        ).find_elements(
-                                            By.CSS_SELECTOR, ".course-detail"
-                                        )
-                                    ),
-                                )
-                            ]
 
-                            print(
-                                f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
-                                file=sys.stderr,
-                            )
+def main():
+    global driver
+    global options
 
-                            begin_date = driver.find_element(
-                                "id", "lblBeginEffectiveDate"
-                            ).text
-                            end_date = driver.find_element(
-                                "id", "lblEndEffectiveDate"
-                            ).text
+    if len(sys.argv) != 3:
+        print(
+            f"USAGE: python {sys.argv[0]} <page number to scrape> <output file>",
+            file=sys.stderr,
+        )
+        return 1
 
-                            driver.find_element(
-                                By.CSS_SELECTOR, ".modal-header button"
-                            ).click()
+    PAGE_NUM_TO_SCRAPE = int(sys.argv[1])
+    OUT_FILENAME = sys.argv[2]
 
-                            courses += [
-                                {
-                                    "transfer": transfer,
-                                    "rpi": rpi,
-                                    "begin": begin_date,
-                                    "end": end_date,
-                                }
-                            ]
-                            state["course_idx"] += 1
-                        except (Exception, KeyboardInterrupt) as e:
-                            institutions.update(
-                                {
-                                    inst_name: {
-                                        "city": city,
-                                        "state": us_state,
-                                        "courses": courses,
-                                    }
-                                }
-                            )
-                            raise e
-                    state["course_idx"] = 0
-                    state["course_pg"] += 1
+    print(f"Setting up selenium Firefox emulator", file=sys.stderr)
+    options = webdriver.FirefoxOptions()
+    options.add_argument("--headless")
 
-                institutions.update(
-                    {inst_name: {"city": city, "state": us_state, "courses": courses}}
-                )
-                state["course_pg"] = 1
-                state["inst_idx"] += 1
+    user_agent = UserAgent().random
+    options.set_preference("general.useragent.override", user_agent)
+    print(f"Using randomized user agent {user_agent}", file=sys.stderr)
 
-                driver.find_element("id", "btnSwitchView").click()
-                wait(
-                    EC.text_to_be_present_in_element(
-                        ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
-                    ),
-                )
-                state["inst_idx"] = 0
-                state["inst_pg"] += 1
+    with open(OUT_FILENAME, "w") as transferjson:
+        json.dump(scrape_page(PAGE_NUM_TO_SCRAPE), transferjson, indent=4)
 
-    except (Exception, KeyboardInterrupt) as e:
-        print("Program hits exception and will save and terminate", file=sys.stderr)
-        print(traceback.format_exc(), file=sys.stderr)
-
-        print("Program will terminate with state: ", end="", file=sys.stderr)
-        json.dump(state, sys.stderr, indent=4)
-        print("", file=sys.stderr)
-    with open(transfer_json_path, "w") as transferjson:
-        json.dump(institutions, transferjson, indent=4)
-    with open(state_json_path, "w") as statejson:
-        json.dump(state, statejson, indent=4)
     driver.quit()
 
 
 if __name__ == "__main__":
-    main()
+    exit(main())
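
The rewritten scrape_institution selects every equivalency with the header checkbox, opens the "My Course EQ List" view, and reads the gdvMyCourseEQList table as a flat list of td cells, regrouping them six at a time with zip(*[iter(...)] * 6). A minimal sketch of that grouping idiom, using made-up placeholder strings in place of Selenium elements:

# Sketch of the zip(*[iter(...)] * 6) grouping used in scrape_institution.
# The cell values below are invented placeholders, not real scraped data.
cells = [
    "XFER 101", "RPI 1010", "note A", "Fall 2019", "Spring 2025", "",
    "XFER 202", "RPI 2020", "note B", "Fall 2019", "Spring 2025", "",
]

# One shared iterator repeated six times: each output tuple drains six
# consecutive cells, i.e. one table row of (transfer, rpi, note, begin, end, extra).
rows = list(zip(*[iter(cells)] * 6))

for transfer, rpi, note, begin, end, _ in rows:
    print(f"{transfer} -> {rpi} ({begin} to {end})")

Each scrape-page job invokes the new entry point per page, for example python3 main.py 3 transfer_3.json, matching the USAGE string in main().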