quatalog-scraper/.github/workflows/scraper.yml

212 lines
6.5 KiB
YAML
Raw Normal View History

2023-02-10 05:41:24 +00:00
# Workflow: refresh the scraped data, regenerate the static site from it,
# and deploy the result to GitHub Pages.
name: Update data, generate static site, and deploy to Pages

on:
  # Allow the workflow to be started manually.
  workflow_dispatch: {}
  # Also run on a schedule: every hour, at minute 15.
  schedule:
    - cron: "15 * * * *"

jobs:
# Job: run the scraper against the QuACS data and publish the resulting
# JSON files as the "new-data" artifact for the downstream jobs.
scrape-data:
  name: Scrape data
  runs-on: ubuntu-latest
  steps:
    # The repository that hosts this workflow; it provides bin/CourseOfferingsScraper.
    - name: Checkout scraping repo
      uses: actions/checkout@v4
      with:
        path: quatalog-scraping
    # Input data for the scraper.
    - name: Clone QuACS data
      uses: actions/checkout@v4
      with:
        repository: quacs/quacs-data
        path: quacs-data
    - name: Run scraper
      run: |
        # Usage: CourseOfferingsScraper <data_directory> <terms_offered_file> <prerequisites_file> <list_of_terms_file>
        mkdir -p new-data
        quatalog-scraping/bin/CourseOfferingsScraper \
          quacs-data/semester_data \
          new-data/terms_offered.json \
          new-data/prerequisites.json \
          new-data/terms_list.json
    - name: Copy catalog.json from QuACS data
      run: |
        # Take the value on the first line mentioning current_term: the 4th
        # double-quote-delimited field, i.e. the VALUE in `"current_term": "VALUE"`.
        CURRENT_TERM="$(grep -m 1 -e 'current_term' new-data/terms_list.json | cut -f4 -d '"')"
        # Fail with a clear message instead of letting rsync choke on
        # "semester_data//catalog.json" when the term could not be extracted.
        if [ -z "$CURRENT_TERM" ]; then
          echo "::error::Could not determine current_term from new-data/terms_list.json"
          exit 1
        fi
        rsync -avz "quacs-data/semester_data/$CURRENT_TERM/catalog.json" new-data/catalog.json
    - name: Upload data to artifact
      uses: actions/upload-artifact@v4
      with:
        name: new-data
        path: new-data/
        # An empty upload would only surface later as a download failure
        # in the dependent jobs; fail here instead.
        if-no-files-found: error
2023-02-10 03:06:25 +00:00
# Job: commit the freshly scraped JSON files to the quatalog/data repository.
push-new-data:
  name: Push new data to data repo
  runs-on: ubuntu-latest
  needs: [scrape-data]
  steps:
    # Checked out with PUSH_TOKEN; the `git push` at the end of this job
    # relies on the credentials set up by this checkout.
    - name: Clone Quatalog data
      uses: actions/checkout@v4
      with:
        repository: quatalog/data
        path: quatalog-data
        token: ${{ secrets.PUSH_TOKEN }}
    # Files produced by the scrape-data job.
    - name: Download data from artifact
      uses: actions/download-artifact@v4
      with:
        name: new-data
        path: new-data
    - name: Copy data to repo directory
      run: |
        rsync -avz new-data/ quatalog-data/
    - name: Push new data
      working-directory: quatalog-data
      run: |
        git config user.name "Quatalog Updater"
        git config user.email "github_actions@quatalog.com"
        git add terms_offered.json prerequisites.json terms_list.json catalog.json
        # Stop successfully only when nothing is staged. Any other commit
        # failure must fail the step rather than be silently ignored.
        if git diff --cached --quiet; then
          echo "No data changes to commit."
          exit 0
        fi
        git commit -m "$(date)"
        git push
2023-02-10 03:59:40 +00:00
# Job: render the static site from the scraped data and commit the output
# to the static-generated branch of quatalog/site.
generate-site:
  name: Generate the static site
  runs-on: ubuntu-latest
  needs: [scrape-data]
  steps:
    # The repository that hosts this workflow; it provides bin/GenerateHtml.
    - name: Checkout scraping repo
      uses: actions/checkout@v4
      with:
        path: quatalog-scraping
    # Checked out with PUSH_TOKEN; the `git push` at the end of this job
    # relies on the credentials set up by this checkout.
    - name: Clone Quatalog static site
      uses: actions/checkout@v4
      with:
        repository: quatalog/site
        ref: static-generated
        path: quatalog-site
        token: ${{ secrets.PUSH_TOKEN }}
    # Files produced by the scrape-data job.
    - name: Download data from artifact
      uses: actions/download-artifact@v4
      with:
        name: new-data
        path: new-data
    - name: Run the HTML generator
      run: |
        # Usage: GenerateHtml <terms_offered_file> <prerequisites_file> <list_of_terms_file> <catalog_file> <out_directory> <searchable_catalog_file> <courses_list_file>
        quatalog-scraping/bin/GenerateHtml \
          new-data/terms_offered.json \
          new-data/prerequisites.json \
          new-data/terms_list.json \
          new-data/catalog.json \
          courses/ \
          searchable_catalog.json \
          courses_list.json
    # Copy the generator output into the site checkout: pages under
    # courses/, the two JSON indexes under json/.
    - name: Merge data
      run: |
        mkdir -p quatalog-site/courses quatalog-site/json
        rsync -avz courses/ quatalog-site/courses/
        rsync -avz searchable_catalog.json quatalog-site/json/searchable_catalog.json
        rsync -avz courses_list.json quatalog-site/json/courses_list.json
    - name: Push generated HTML
      working-directory: quatalog-site
      run: |
        git config user.name "Quatalog Updater"
        git config user.email "github_actions@quatalog.com"
        git add courses json
        # Stop successfully only when nothing is staged. Any other commit
        # failure must fail the step rather than be silently ignored.
        if git diff --cached --quiet; then
          echo "No site changes to commit."
          exit 0
        fi
        git commit -m "$(date)"
        git push
2023-02-10 05:33:40 +00:00
2023-07-28 03:54:26 +00:00
# Job: build courses.csv from the <li> entries in the generated course
# pages and commit it to the quatalog/data repository.
push-csv:
  name: Push CSV file to data repo
  runs-on: ubuntu-latest
  needs: [generate-site]
  steps:
    # Read-only checkout of the generated pages (no push happens here).
    - name: Checkout site repo/static-generated branch
      uses: actions/checkout@v4
      with:
        repository: quatalog/site
        ref: static-generated
        path: static-generated
    # Checked out with PUSH_TOKEN; the `git push` at the end of this job
    # relies on the credentials set up by this checkout.
    - name: Checkout data repo
      uses: actions/checkout@v4
      with:
        repository: quatalog/data
        path: quatalog-data
        token: ${{ secrets.PUSH_TOKEN }}
    - name: Create CSV file
      run: |
        # Without pipefail a failing stage (e.g. grep matching nothing) is
        # ignored and a header-only CSV would be committed.
        set -o pipefail
        echo '"Instructor","Course"' > quatalog-data/courses.csv
        cd static-generated/courses
        # Pipeline stages:
        #   1. grep -H -o : emit `FILE.html:<li>TEXT</li>` for every list item
        #                   (-H forces the file prefix even for a single file).
        #   2. sort -u    : de-duplicate.
        #   3. grep -v    : drop lines matching any of the excluded code patterns.
        #   4. sed        : rewrite to `"FILE","TEXT"`, then swap the two fields
        #                   so the columns match the "Instructor","Course" header.
        #   5. sort       : final ordering, appended below the header.
        grep -H -o -e '<li>.*</li>' *.html |
          sort -u |
          grep -ve "-[0-9]9[4-7][0-9]" -e "-[0-9]9[89]0" -e "-[0-9]00[1-9]" -e "USA[RF]" -e "ADMN" -e "USNA" |
          sed -e 's/^/"/' -e 's/.html:<li>/","/' -e 's#</li>#"#' -e 's/"\([^"]*\)","\([^"]*\)"/"\2","\1"/' |
          sort >> ../../quatalog-data/courses.csv
    - name: Push CSV file
      working-directory: quatalog-data
      run: |
        git config user.name "Quatalog Updater"
        git config user.email "github_actions@quatalog.com"
        git add courses.csv
        # Stop successfully only when nothing is staged. Any other commit
        # failure must fail the step rather than be silently ignored.
        if git diff --cached --quiet; then
          echo "No CSV changes to commit."
          exit 0
        fi
        git commit -m "$(date)"
        git push
# Job: package the static-generated branch as a tarball and upload it as
# the "github-pages" artifact.
prepare-site:
  name: Prepare static site
  needs: [generate-site]
  runs-on: ubuntu-latest
  steps:
    # No `path` is given, so the branch lands in the workspace root,
    # which is the directory the tar step archives.
    - name: Checkout static-generated branch
      uses: actions/checkout@v4
      with:
        ref: static-generated
        repository: quatalog/site
    - name: Setup Pages
      uses: actions/configure-pages@v4
    # Archive the workspace, following symlinks and hard links, and
    # leaving out repository metadata and top-level docs.
    - name: Archive github-pages artifact
      run: |
        tar \
          --dereference --hard-dereference \
          --exclude=.git \
          --exclude=.github \
          --exclude=LICENSE \
          --exclude=README.md \
          -cf "$RUNNER_TEMP/artifact.tar" .
    - name: Upload github-pages artifact
      uses: actions/upload-artifact@v4
      with:
        path: ${{ runner.temp }}/artifact.tar
        name: github-pages
# Job: deploy to GitHub Pages once prepare-site has finished.
deploy-site:
  name: Deploy to GitHub Pages
  needs: [prepare-site]
  runs-on: ubuntu-latest
  # The environment URL is taken from the output of the "deployment" step below.
  environment:
    name: github-pages
    url: ${{ steps.deployment.outputs.page_url }}
  # Token scopes granted to this job.
  permissions:
    id-token: write
    pages: write
  # Runs share the "pages" group; a newer run cancels one still in progress.
  concurrency:
    group: pages
    cancel-in-progress: true
  steps:
    - id: deployment
      name: Deploy to GitHub Pages
      uses: actions/deploy-pages@v4