mirror of
https://github.com/quatalog/quatalog.git
synced 2024-09-30 05:29:17 +00:00
Compare commits
5 commits
aec272e28e
...
9c374bf130
| Author | SHA1 | Date | |
|---|---|---|---|
| | 9c374bf130 | | |
| | 10360ff57c | | |
| | 3b239f9cef | | |
| | de89a56808 | | |
| | 45cfba68ac | | |
2
.github/workflows/transfer.yml
vendored
2
.github/workflows/transfer.yml
vendored
|
@ -75,7 +75,7 @@ jobs:
|
||||||
|
|
||||||
- name: Combine JSONs
|
- name: Combine JSONs
|
||||||
run: |
|
run: |
|
||||||
cat new-data/* | jq -s 'add' > data/transfer.json
|
cat new-data/* | jq -s 'add | sort_by(.institution)' > data/transfer.json
|
||||||
|
|
||||||
- name: Commit data
|
- name: Commit data
|
||||||
working-directory: data
|
working-directory: data
|
||||||
|
|
|
@ -29,7 +29,7 @@ def wait(ec):
|
||||||
global driver
|
global driver
|
||||||
|
|
||||||
WebDriverWait(
|
WebDriverWait(
|
||||||
driver, 40, ignored_exceptions=[StaleElementReferenceException]
|
driver, 35, ignored_exceptions=[StaleElementReferenceException]
|
||||||
).until(ec)
|
).until(ec)
|
||||||
sleep(random.uniform(400, 1900) / 1000)
|
sleep(random.uniform(400, 1900) / 1000)
|
||||||
|
|
||||||
|
@ -44,6 +44,7 @@ def wait(ec):
|
||||||
def jump_to_page(curr_page, to_page, postback_type, pagination_type):
|
def jump_to_page(curr_page, to_page, postback_type, pagination_type):
|
||||||
global driver
|
global driver
|
||||||
|
|
||||||
|
wait(EC.visibility_of_element_located((By.ID, postback_type)))
|
||||||
page = driver.find_element(By.ID, postback_type)
|
page = driver.find_element(By.ID, postback_type)
|
||||||
try:
|
try:
|
||||||
num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
|
num_pages = int(driver.find_element(By.ID, pagination_type).text.split()[-1])
|
||||||
|
@ -100,7 +101,7 @@ def scrape_page(page_num):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
driver.quit()
|
driver.quit()
|
||||||
print(
|
print(
|
||||||
f"Attempt {i} failed to load page, retrying in 25 seconds...",
|
f"Attempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
sleep(25)
|
sleep(25)
|
||||||
|
@ -125,7 +126,7 @@ def scrape_institution_safe(index, page_num):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
driver.quit()
|
driver.quit()
|
||||||
print(
|
print(
|
||||||
f"\tAttempt {i} failed due to {type(e).__name__}: {e}, retrying in 25 seconds...",
|
f"\tAttempt {i} failed due to {type(e).__name__}, retrying in 25 seconds...",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
sleep(25)
|
sleep(25)
|
||||||
|
|
Loading…
Reference in a new issue