mirror of
https://github.com/quatalog/quatalog.git
synced 2024-11-17 20:42:45 +00:00
Compare commits
3 commits
b017436be9
...
517952f977
Author | SHA1 | Date | |
---|---|---|---|
517952f977 | |||
f1a47dca48 | |||
af25410c5d |
|
@ -93,6 +93,11 @@ def scrape_page(page_num):
|
||||||
for i in range(1, 4):
|
for i in range(1, 4):
|
||||||
try:
|
try:
|
||||||
driver = webdriver.Firefox(options=options)
|
driver = webdriver.Firefox(options=options)
|
||||||
|
driver.get("https://ipinfo.io/ip")
|
||||||
|
print(
|
||||||
|
f'Trying with IP {driver.find_element(By.TAG_NAME, "body").text}',
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
driver.get(
|
driver.get(
|
||||||
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
|
"https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=f080a477-bff8-46df-a5b2-25e9affdd4ed&aid=27b576bb-cd07-4e57-84d0-37475fde70ce"
|
||||||
)
|
)
|
||||||
|
@ -102,13 +107,14 @@ def scrape_page(page_num):
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"Attempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
|
f"Attempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
sleep(25)
|
sleep(25)
|
||||||
else:
|
else:
|
||||||
raise Exception(f"Failed to load the main page after 15 attempts, aborting.")
|
raise Exception(f"Failed to load the main page after 4 attempts, aborting.")
|
||||||
|
|
||||||
num_institutions = len(
|
num_institutions = len(
|
||||||
driver.find_elements(
|
driver.find_elements(
|
||||||
|
@ -133,7 +139,7 @@ def scrape_institution_safe(index, page_num):
|
||||||
)
|
)
|
||||||
sleep(25)
|
sleep(25)
|
||||||
else:
|
else:
|
||||||
raise Exception(f"Failed to scrape {index} after 15 attempts, aborting.")
|
raise Exception(f"Failed to scrape {index} after 4 attempts, aborting.")
|
||||||
|
|
||||||
|
|
||||||
# scrape_institution: Scrapes an institution by index.
|
# scrape_institution: Scrapes an institution by index.
|
||||||
|
@ -181,7 +187,7 @@ def scrape_institution(index, page_num):
|
||||||
"institution": inst_name,
|
"institution": inst_name,
|
||||||
"city": inst_city,
|
"city": inst_city,
|
||||||
"state": inst_state,
|
"state": inst_state,
|
||||||
"courses": [],
|
"transfers": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Open list
|
# Open list
|
||||||
|
@ -212,7 +218,7 @@ def scrape_institution(index, page_num):
|
||||||
"institution": inst_name,
|
"institution": inst_name,
|
||||||
"city": inst_city,
|
"city": inst_city,
|
||||||
"state": inst_state,
|
"state": inst_state,
|
||||||
"courses": transfer_courses,
|
"transfers": transfer_courses,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -231,7 +237,10 @@ def parse_course_td(td, include_credits):
|
||||||
for x in td_text[: len(td_text) - 3]
|
for x in td_text[: len(td_text) - 3]
|
||||||
]
|
]
|
||||||
|
|
||||||
return [parse_one_course(x, include_credits) for x in courses_info]
|
return {
|
||||||
|
"catalog": td.find_element(By.TAG_NAME, "span").text.strip(),
|
||||||
|
"courses": [parse_one_course(x, include_credits) for x in courses_info],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_one_course(course_info, include_credits):
|
def parse_one_course(course_info, include_credits):
|
||||||
|
|
Loading…
Reference in a new issue