Compare commits


16 commits

Author  SHA1        Date                        Message
powe97  a0b9081f8f  2024-03-05 21:14:32 -05:00  --headless
powe97  4f69c1d8a0  2024-03-05 21:14:00 -05:00  Re-get the page to try circumvent timeout
powe97  56c9268398  2024-03-05 20:49:08 -05:00  Disable fail-fast
powe97  02b383b90b  2024-03-05 20:47:41 -05:00  Extend timeout
powe97  95e8238786  2024-03-05 19:10:16 -05:00  Merge branch 'main' of https://github.com/quatalog/quatalog
powe97  fc72fda5de  2024-03-05 19:10:10 -05:00  Remove jump debug print
powe97  e45318404d  2024-03-05 19:06:49 -05:00  Update transfer.yml
powe97  10715c89e3  2024-03-05 19:05:41 -05:00  Update transfer.yml
powe97  52fdab6ce6  2024-03-05 19:03:54 -05:00  Make everything stderr print
powe97  42dbf3c19a  2024-03-05 18:46:02 -05:00  Update transfer.yml
powe97  985f40c4e7  2024-03-05 18:42:05 -05:00  Set up matrix jobs
powe97  cb24d84b46  2024-03-05 18:38:17 -05:00  Merge branch 'main' of https://github.com/quatalog/quatalog
powe97  ce2f22b23b  2024-03-05 18:38:12 -05:00  Merge branch 'main' of https://github.com/quatalog/quatalog
powe97  c8eadc06ee  2024-03-05 18:34:02 -05:00  Merge branch 'main' of https://github.com/quatalog/quatalog
powe97  6ad6f85708  2024-03-05 18:33:54 -05:00  Redesign scraper to not be unbearably slow
powe97  acdd08168f  2024-03-05 18:27:51 -05:00  Update transfer.yml
2 changed files with 186 additions and 367 deletions

.github/workflows/transfer.yml

@@ -1,37 +1,32 @@
 name: Scrape transfer and update file
 run-name: Scrape transfer and update file
-env:
-  DEFAULT_TIMEOUT: 45
 on:
-  # schedule:
-  #   - cron: '*/15 * * * *'
-  repository_dispatch:
-    types: transfer-scraper
   workflow_dispatch:
-    inputs:
-      timeout:
-        description: "Timeout time"
-        required: true
-        type: number
-        default: 2
-concurrency:
-  group: transfer-scraper
 jobs:
-  scrape-data:
-    name: Scrape transfer guide
+  setup:
+    name: Get number of pages and set up scrape page jobs
     runs-on: ubuntu-latest
+    steps:
+      - name: Create matrix parameters
+        id: matrix-params
+        run: |
+          NUM_PAGES="$(curl -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0' 'https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce' | grep -e 'lblInstWithEQPaginationInfo' | grep -Poie '(?<=of )[0-9]*')"
+          MATRIX_PARAMS="$(seq -s "," 1 "$NUM_PAGES")"
+          MATRIX_PARAMS="\"page\": $(sed -e 's/,/}, {"page": /g' <<< "$MATRIX_PARAMS")"
+          echo "matrix-params={\"include\": [{"$MATRIX_PARAMS"}]}" | tee $GITHUB_OUTPUT
+    outputs:
+      matrix-params: ${{ steps.matrix-params.outputs.matrix-params }}
+  scrape-page:
+    name: Scrape page
+    runs-on: ubuntu-latest
+    needs: setup
+    strategy:
+      matrix: ${{ fromJson(needs.setup.outputs.matrix-params) }}
+      fail-fast: false
     steps:
       - name: Checkout scraping repo
         uses: actions/checkout@v4
-        with:
-          path: quatalog-scraping
-      - name: Checkout data repo
-        uses: actions/checkout@v4
-        with:
-          repository: quatalog/data
-          path: data
       - name: Set up python
         uses: actions/setup-python@v5
@@ -40,74 +35,19 @@ jobs:
           cache: 'pip'
       - name: Install dependencies
-        working-directory: quatalog-scraping/transfer_scraper
+        working-directory: transfer_scraper
         run: |
           python -m pip install --upgrade pip
           pip install -r 'requirements.txt'
-      - name: Log IP
+      - name: Run scraper
+        working-directory: transfer_scraper
         run: |
-          echo "Public IP: $(curl -s 'https://ipinfo.io/ip')"
-      - name: Copy data to temp dir
-        run: |
-          mkdir new-data
-          dd status=progress if='data/transfer.json' of='new-data/transfer.json'
-          dd status=progress if='data/transfer_state.json' of='new-data/transfer_state.json'
-      - name: Scrape transfer guide
-        run: |
-          python3 quatalog-scraping/transfer_scraper/main.py new-data/transfer.json new-data/transfer_state.json ${{ github.event.inputs.timeout || env.DEFAULT_TIMEOUT }}
+          python3 main.py ${{ matrix.page }} transfer_${{ matrix.page }}.json
       - name: Upload data to artifact
         uses: actions/upload-artifact@v4
         with:
-          name: transfer-data
-          path: new-data/
-  push-new-data:
-    name: Push new data to data repo
-    runs-on: ubuntu-latest
-    needs: [scrape-data]
-    steps:
-      - name: Clone Quatalog data
-        uses: actions/checkout@v4
-        with:
-          repository: quatalog/data
-          path: quatalog-data
-          token: ${{ secrets.PUSH_TOKEN }}
-      - name: Download data from artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: transfer-data
-          path: data
-      - name: Copy data to repo directory
-        run: |
-          ls -lsa data
-          dd status=progress if='data/transfer.json' of='quatalog-data/transfer.json'
-          dd status=progress if='data/transfer_state.json' of='quatalog-data/transfer_state.json'
-      - name: Push new data
-        working-directory: quatalog-data
-        run: |
-          git config user.name "Quatalog Updater"
-          git config user.email "github_actions@quatalog.com"
-          git add transfer.json transfer_state.json
-          git commit -m "$(date)" || exit 0
-          git push
-  re-run-scraper:
-    name: Tell Github to run this workflow again
-    runs-on: ubuntu-latest
-    needs: [push-new-data]
-    steps:
-      - name: Tell Github to run this workflow again
-        run: |
-          curl -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: token ${{ secrets.PUSH_TOKEN }}" \
-            --request POST \
-            --data '{"event_type": "transfer-scraper"}' \
-            "https://api.github.com/repos/quatalog/quatalog/dispatches"
+          name: transfer-page-${{ matrix.page }}
+          path: transfer_scraper/transfer_${{ matrix.page }}.json

transfer_scraper/main.py

@@ -3,22 +3,17 @@ import html
 import sys
 import re
 import os.path
-import traceback
 from time import sleep
 import random
-from signal import alarm, SIGALRM, signal
 from fake_useragent import UserAgent
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import StaleElementReferenceException
-from selenium.common.exceptions import TimeoutException
-from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import (
+    StaleElementReferenceException,
+    NoSuchElementException,
+)
 
 
-def raise_(ex):
-    raise ex
-
-
 # Fix course titles accounting for Roman numerals up to X
@@ -29,84 +24,36 @@ def normalize_title(input):
     return s.strip()
 
 
+# Waits until EC plus some random wait time
 def wait(ec):
     global driver
     WebDriverWait(
-        driver, 20, ignored_exceptions=[StaleElementReferenceException]
+        driver, 40, ignored_exceptions=[StaleElementReferenceException]
     ).until(ec)
     sleep(random.uniform(400, 1900) / 1000)
 
 
-def scrape_course_card(html_id, i, note):
+# jump_to_page: navigates to a paginated page on this insufferable website
+#
+# curr_page: the current page number
+# to_page: the page number to jump to
+# num_pages: the total number of pages
+# postback_type: javascript:__doPostBack('<this field>','Page$3')
+# pagination_type: <span id="<this field>">PAGE 1 OF 27<br></span>
+def jump_to_page(curr_page, to_page, postback_type, pagination_type):
     global driver
-    trs = (
-        driver.find_element("id", html_id)
-        .find_elements(By.CSS_SELECTOR, ".course-detail")[i]
-        .find_elements(By.TAG_NAME, "tr")
-    )
-    course_name_and_id = trs[0].text.split()
-    course_desc = ""
-    if trs[1].find_element(By.TAG_NAME, "td").get_attribute("colspan") == "2":
-        course_desc = trs[1].text
-    course_department = normalize_title(
-        next((x for x in trs if x.text.strip().startswith("Department:")))
-        .find_elements(By.TAG_NAME, "td")[1]
-        .text
-    )
-    course_catalog = normalize_title(
-        next((x for x in trs if x.text.strip().startswith("Source catalog:")))
-        .find_elements(By.TAG_NAME, "td")[1]
-        .text
-    )
+    page = driver.find_element(By.ID, postback_type)
     try:
-        k = 1 + next(
-            i for i, v in enumerate(course_name_and_id) if bool(re.search(r"\d", v))
-        )
-        course_id = " ".join(course_name_and_id[0:k])
-        course_name = normalize_title(" ".join(course_name_and_id[k:]))
-    except StopIteration:  # Handling for Not Transferrable
-        course_id = course_name_and_id[0]
-        course_name = normalize_title(" ".join(course_name_and_id[1:]))
-    if not note:
-        try:
-            course_credits = (
-                next((x for x in trs if x.text.strip().startswith("Units:")))
-                .find_elements(By.TAG_NAME, "td")[1]
-                .text.strip()
-            )
-        except:
-            course_credits = ""
-        return {
-            "id": course_id,
-            "name": course_name,
-            "credits": course_credits,
-            "desc": course_desc,
-            "department": course_department,
-            "catalog": course_catalog,
-        }
-    else:
-        course_note = driver.find_element("id", "lblCommentsPublic").text.strip()
-        return {
-            "id": course_id,
-            "name": course_name,
-            "note": course_note,
-            "desc": course_desc,
-            "department": course_department,
-            "catalog": course_catalog,
-        }
-
-
-def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
-    page = driver.find_element("id", postback_type)
-    if num_pages == 1:
+        num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
+    except NoSuchElementException:
         return 1, page
-    if to_page > num_pages or to_page < 1:
-        raise ValueError(
-            f"to_page was out of range ({to_page} not in [1, {num_pages}])"
-        )
     while curr_page != to_page:
         jumpable_pages = {
             int(x.get_attribute("href").split("'")[3][5:]): x
@@ -117,7 +64,7 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
                 + """','Page$"]""",
             )
         }
-        curr_page = int(driver.find_element("id", pagination_type).text.split()[-3])
+        curr_page = int(driver.find_element(By.ID, pagination_type).text.split()[-3])
         if to_page in jumpable_pages:
             jumpable_pages[to_page].click()
             curr_page = to_page
@@ -127,251 +74,183 @@ def jump_to_page(curr_page, to_page, num_pages, postback_type, pagination_type):
         else:
             jumpable_pages[max(jumpable_pages)].click()
             curr_page = max(jumpable_pages)
+        print(f"Jumping to {postback_type} page {curr_page}", file=sys.stderr)
         wait(EC.staleness_of(page))
         sleep(random.uniform(400, 1900) / 1000)
-        page = driver.find_element("id", postback_type)
+        page = driver.find_element(By.ID, postback_type)
     return curr_page, page
 
 
-def main():
-    global driver
-
-    if len(sys.argv) != 3 and len(sys.argv) != 4:
-        print(
-            f"USAGE: python {sys.argv[0]} <transfer file> <state file> [timeout minutes]"
-        )
-        exit(1)
-    transfer_json_path = sys.argv[1]
-    state_json_path = sys.argv[2]
-    timeout_seconds = int(sys.argv[3] if len(sys.argv) == 4 else 120) * 60
-
-    # Set up timeout so that the GH action does not run forever, pretend it's ^C
-    print(f"Setting timeout to {timeout_seconds} seconds", file=sys.stderr)
-    signal(SIGALRM, lambda a, b: raise_(KeyboardInterrupt))
-    alarm(timeout_seconds)
-
-    options = webdriver.FirefoxOptions()
-    options.add_argument("--headless")
-    user_agent = UserAgent().random
-    options.set_preference("general.useragent.override", user_agent)
-    # options.set_preference("network.proxy.socks", "")
-    # options.set_preference("network.proxy.socks_port", )
-    # options.set_preference("network.proxy.socks_remote_dns", True)
-    # options.set_preference("network.proxy.type", 1)
-    print(f"Using randomized user agent {user_agent}", file=sys.stderr)
-
+# scrape_page: Scrapes a page of institutions
+#
+# page_num: The page to scrape.
+# Note that the current page before running this function must be 1.
+def scrape_page(page_num):
+    global driver
+    global options
+
     driver = webdriver.Firefox(options=options)
     driver.get(
         "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
     )
-
-    print(
-        f'Title is {driver.find_element(By.TAG_NAME, "title").get_attribute("innerText").strip()}',
-        file=sys.stderr,
-    )
-
-    num_pages = int(
-        driver.find_element("id", "lblInstWithEQPaginationInfo").text.split()[-1]
-    )
-    print(f"{num_pages} pages detected", file=sys.stderr)
-
-    state = {"inst_pg": 1, "inst_idx": 0, "course_pg": 1, "course_idx": 0}
-    institutions = {}
-    if os.path.isfile(state_json_path):
-        with open(state_json_path, "r") as statejson:
-            state = json.load(statejson)
-    if os.path.isfile(transfer_json_path):
-        with open(transfer_json_path, "r") as transferjson:
-            institutions = json.load(transferjson)
-
-    print("Loaded state: ", end="", file=sys.stderr)
-    json.dump(state, sys.stderr, indent=4)
-    print("", file=sys.stderr)
-
-    if state["inst_pg"] > num_pages:
-        raise Exception
+    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
+    num_institutions = len(
+        driver.find_elements(
+            By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
+        )
+    )
+    driver.quit()
+
+    print(f"Scraping page {page_num}, found {num_institutions} links", file=sys.stderr)
+    return [scrape_institution(i, page_num) for i in range(0, num_institutions)]
+
+
+# scrape_institution: Scrapes an institution by index.
+#
+# index: the 0-indexed index of the instituion to scrape on the page we are on.
+def scrape_institution(index, page_num):
+    global driver
+    global options
+
+    driver = webdriver.Firefox(options=options)
+    driver.get(
+        "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
+    )
+    jump_to_page(1, page_num, "gdvInstWithEQ", "lblInstWithEQPaginationInfo")
+
+    inst_link = driver.find_element(
+        By.ID, f"gdvInstWithEQ_btnCreditFromInstName_{index}"
+    )
+    [inst_name, inst_city, inst_state, _] = [
+        e.text
+        for e in inst_link.find_element(By.XPATH, "../..").find_elements(
+            By.TAG_NAME, "td"
+        )
+    ]
+    inst_name, inst_city = normalize_title(inst_name), normalize_title(inst_city)
+    inst_link.click()
+    wait(EC.staleness_of(inst_link))
+    print(f"Scraping {inst_name} ({inst_city}, {inst_state})", file=sys.stderr)
+
+    # Add all courses
+    try:
+        num_pages = int(
+            driver.find_element(By.ID, "lblCourseEQPaginationInfo").text.split()[-1]
+        )
+    except NoSuchElementException:
+        num_pages = 1
 
     try:
-        curr_inst_page = 1
-        while state["inst_pg"] <= num_pages:
-            curr_inst_page, page = jump_to_page(
-                curr_inst_page,
-                state["inst_pg"],
-                num_pages,
-                "gdvInstWithEQ",
-                "lblInstWithEQPaginationInfo",
-            )
-
-            inst_list_len = len(
-                page.find_elements(
-                    By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
-                )
-            )
-
-            while state["inst_idx"] < inst_list_len:
-                institution_link = driver.find_element(
-                    "id", "gdvInstWithEQ"
-                ).find_elements(
-                    By.CSS_SELECTOR, "a[id^=gdvInstWithEQ_btnCreditFromInstName_]"
-                )[
-                    state["inst_idx"]
-                ]
-                fields = institution_link.find_element(By.XPATH, "../..").find_elements(
-                    By.CSS_SELECTOR, ".gdv_boundfield_uppercase"
-                )
-                inst_name = normalize_title(institution_link.text)
-                city = normalize_title(fields[0].text)
-                us_state = fields[1].text.strip()
-
-                institution_link.click()
-                wait(EC.staleness_of(institution_link))
-
-                try:
-                    course_pages_len = int(
-                        driver.find_element(
-                            "id", "lblCourseEQPaginationInfo"
-                        ).text.split()[-1]
-                    )
-                except NoSuchElementException:
-                    course_pages_len = 1
-
-                try:
-                    courses = institutions[inst_name]["courses"]
-                except Exception:
-                    courses = []
-
-                curr_course_page = 1
-                while state["course_pg"] <= course_pages_len:
-                    curr_course_page, page = jump_to_page(
-                        curr_course_page,
-                        state["course_pg"],
-                        course_pages_len,
-                        "gdvCourseEQ",
-                        "lblCourseEQPaginationInfo",
-                    )
-
-                    course_links_len = len(
-                        page.find_elements(
-                            By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
-                        )
-                    )
-
-                    while state["course_idx"] < course_links_len:
-                        course_link = driver.find_element(
-                            "id", "gdvCourseEQ"
-                        ).find_elements(
-                            By.CSS_SELECTOR, "a[id^=gdvCourseEQ_btnViewCourseEQDetail_]"
-                        )[
-                            state["course_idx"]
-                        ]
-                        course_link.click()
-
-                        try:
-                            wait(
-                                EC.element_to_be_clickable(
-                                    (By.CSS_SELECTOR, ".modal-header button")
-                                ),
-                            )
-
-                            transfer = [
-                                scrape_course_card("lblSendCourseEQDetail", i, False)
-                                for i in range(
-                                    0,
-                                    len(
-                                        driver.find_element(
-                                            "id", "lblSendCourseEQDetail"
-                                        ).find_elements(
-                                            By.CSS_SELECTOR, ".course-detail"
-                                        )
-                                    ),
-                                )
-                            ]
-
-                            rpi = [
-                                scrape_course_card("lblReceiveCourseEQDetail", i, True)
-                                for i in range(
-                                    0,
-                                    len(
-                                        driver.find_element(
-                                            "id", "lblReceiveCourseEQDetail"
-                                        ).find_elements(
-                                            By.CSS_SELECTOR, ".course-detail"
-                                        )
-                                    ),
-                                )
-                            ]
-
-                            print(
-                                f"{inst_name} ({state['inst_idx']}:{state['inst_pg']}/{num_pages}): {transfer[0]['id']} {transfer[0]['name']} -> {rpi[0]['id']} {rpi[0]['name']} ({state['course_idx']}:{state['course_pg']}/{course_pages_len})",
-                                file=sys.stderr,
-                            )
-
-                            begin_date = driver.find_element(
-                                "id", "lblBeginEffectiveDate"
-                            ).text
-                            end_date = driver.find_element(
-                                "id", "lblEndEffectiveDate"
-                            ).text
-
-                            driver.find_element(
-                                By.CSS_SELECTOR, ".modal-header button"
-                            ).click()
-
-                            courses += [
-                                {
-                                    "transfer": transfer,
-                                    "rpi": rpi,
-                                    "begin": begin_date,
-                                    "end": end_date,
-                                }
-                            ]
-                            state["course_idx"] += 1
-                        except (Exception, KeyboardInterrupt) as e:
-                            institutions.update(
-                                {
-                                    inst_name: {
-                                        "city": city,
-                                        "state": us_state,
-                                        "courses": courses,
-                                    }
-                                }
-                            )
-                            raise e
-                    state["course_idx"] = 0
-                    state["course_pg"] += 1
-
-                institutions.update(
-                    {inst_name: {"city": city, "state": us_state, "courses": courses}}
-                )
-                state["course_pg"] = 1
-                state["inst_idx"] += 1
-
-            driver.find_element("id", "btnSwitchView").click()
-            wait(
-                EC.text_to_be_present_in_element(
-                    ("id", "lblInstWithEQPaginationInfo"), str(state["inst_pg"])
-                ),
-            )
-            state["inst_idx"] = 0
-            state["inst_pg"] += 1
-    except (Exception, KeyboardInterrupt) as e:
-        print("Program hits exception and will save and terminate", file=sys.stderr)
-        print(traceback.format_exc(), file=sys.stderr)
-        print("Program will terminate with state: ", end="", file=sys.stderr)
-        json.dump(state, sys.stderr, indent=4)
-        print("", file=sys.stderr)
-
-    with open(transfer_json_path, "w") as transferjson:
-        json.dump(institutions, transferjson, indent=4)
-    with open(state_json_path, "w") as statejson:
-        json.dump(state, statejson, indent=4)
+        for i in range(1, num_pages + 1):
+            jump_to_page(max(1, i - 1), i, "gdvCourseEQ", "lblCourseEQPaginationInfo")
+            driver.find_element(By.ID, "gdvCourseEQ_cbxHeaderCheckAll").click()
+    except NoSuchElementException:
+        # Institution has no data
+        return {
+            "institution": inst_name,
+            "city": inst_city,
+            "state": inst_state,
+            "courses": [],
+        }
+
+    # Open list
+    driver.find_element(By.ID, "btnAddToMyEQList").click()
+    wait(EC.visibility_of_element_located((By.ID, "gdvMyCourseEQList")))
+
+    # Scrape list
+    tds = driver.find_element(By.ID, "gdvMyCourseEQList").find_elements(
+        By.TAG_NAME, "td"
+    )
+
+    transfer_courses = [
+        {
+            "transfer": parse_course_td(transfer_course),
+            "rpi": parse_course_td(rpi_course, note.text.strip()),
+            "begin": begin.text.strip(),
+            "end": end.text.strip(),
+        }
+        for transfer_course, rpi_course, note, begin, end, _ in zip(
+            *[iter(x for x in tds)] * 6
+        )
+    ]
+
+    driver.quit()
+
+    return {
+        "institution": inst_name,
+        "city": inst_city,
+        "state": inst_state,
+        "courses": transfer_courses,
+    }
+
+
+def parse_course_td(td, note=None):
+    course_info = (
+        html.unescape(td.get_attribute("innerHTML")).strip().split("<br>")[0].split()
+    )
+
+    # Not all schools use the same course code format, so this figures out how long
+    # it is if it exists, it will not exist for Not Transferrable.
+    try:
+        course_id_delim = 1 + list(
+            bool(re.search(r"\d", s)) for s in course_info
+        ).index(True)
+    except ValueError:
+        course_id_delim = 1
+
+    # Same deal with credit counts.
+    try:
+        cr_delim = (
+            len(course_info)
+            - 1
+            - list(bool(re.search(r"\(", s)) for s in course_info[::-1]).index(True)
+        )
+    except ValueError:
+        cr_delim = len(course_info)
+
+    # note serves as a credit count override, since the RPI-side credit counts
+    # are inaccurate
+    out = {
+        "id": " ".join(course_info[:course_id_delim]),
+        "name": normalize_title(" ".join(course_info[course_id_delim:cr_delim])),
+        "catalog": td.find_element(By.TAG_NAME, "span").text,
+    }
+    if note is None:
+        out.update({"credits": str(" ".join(course_info[cr_delim:])[1:-1])}),
+        return out
+    else:
+        out.update({"note": note})
+        return out
+
+
+def main():
+    global driver
+    global options
+
+    if len(sys.argv) != 3:
+        print(
+            f"USAGE: python {sys.argv[0]} <page number to scrape> <output file>",
+            file=sys.stderr,
+        )
+        return 1
+    PAGE_NUM_TO_SCRAPE = int(sys.argv[1])
+    OUT_FILENAME = sys.argv[2]
+
+    print(f"Setting up selenium Firefox emulator", file=sys.stderr)
+    options = webdriver.FirefoxOptions()
+    options.add_argument("--headless")
+
+    user_agent = UserAgent().random
+    options.set_preference("general.useragent.override", user_agent)
+    print(f"Using randomized user agent {user_agent}", file=sys.stderr)
+
+    with open(OUT_FILENAME, "w") as transferjson:
+        json.dump(scrape_page(PAGE_NUM_TO_SCRAPE), transferjson, indent=4)
 
     driver.quit()
 
 
 if __name__ == "__main__":
-    main()
+    exit(main())
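A note on the transfer_courses comprehension in the new scrape_institution: zip(*[iter(...)] * 6) hands zip six references to one shared iterator, so the flat list of table cells is consumed six at a time, one tuple per course row. A self-contained sketch with made-up strings standing in for the Selenium td elements:

# Hypothetical cell texts; the real code iterates over WebElement objects.
cells = [
    "CSCI 101", "CSCI 1100", "note A", "2019-09", "2024-05", "",
    "MATH 200", "MATH 1010", "note B", "2018-01", "", "",
]

rows = [
    {"transfer": t, "rpi": r, "note": n, "begin": b, "end": e}
    for t, r, n, b, e, _ in zip(*[iter(cells)] * 6)
]
print(rows)  # two dicts, one per six-cell row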
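Similarly, parse_course_td's two delimiter searches are easier to follow outside the diff: the course id runs through the first whitespace-separated token that contains a digit, and the credit count is taken from the last parenthesized token. A hedged sketch on an invented cell text (the real function reads a td's innerHTML):

import re

course_info = "BIOL 1010 Introductory Biology I (4.00)".split()

try:
    # End of the course code: first token containing a digit.
    course_id_delim = 1 + [bool(re.search(r"\d", s)) for s in course_info].index(True)
except ValueError:
    course_id_delim = 1  # e.g. "Not Transferrable" has no digit token

try:
    # Start of the credit count: last token containing "(".
    cr_delim = (
        len(course_info)
        - 1
        - [bool(re.search(r"\(", s)) for s in course_info[::-1]].index(True)
    )
except ValueError:
    cr_delim = len(course_info)

print(" ".join(course_info[:course_id_delim]))          # BIOL 1010
print(" ".join(course_info[course_id_delim:cr_delim]))  # Introductory Biology I
print(" ".join(course_info[cr_delim:])[1:-1])           # 4.00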