# quatalog-scraper/.github/workflows/scraper.yml
#
# 212 lines
# 6.5 KiB
# YAML

# Workflow that updates the data, generates the static site, and deploys it to
# GitHub Pages.
name: Update data, generate static site, and deploy to Pages
on:
  # Manual trigger (no inputs).
  workflow_dispatch:
  # Scheduled trigger: minute 15 of every hour.
  schedule:
    - cron: '15 * * * *'
jobs:
# Job (entry of the workflow's `jobs:` mapping): run the course-offerings
# scraper over the QuACS semester data and publish the results as the
# `new-data` artifact, which the push-new-data and generate-site jobs download.
scrape-data:
  name: Scrape data
  runs-on: ubuntu-latest
  steps:
    - name: Checkout scraping repo
      uses: actions/checkout@v4
      with:
        path: quatalog-scraping
    - name: Clone QuACS data
      uses: actions/checkout@v4
      with:
        repository: quacs/quacs-data
        path: quacs-data
    - name: Run scraper
      run: |
        # Usage: CourseOfferingsScraper <data_directory> <terms_offered_file> <prerequisites_file> <list_of_terms_file>
        mkdir -p new-data
        quatalog-scraping/bin/CourseOfferingsScraper \
          quacs-data/semester_data \
          new-data/terms_offered.json \
          new-data/prerequisites.json \
          new-data/terms_list.json
    - name: Copy catalog.json from QuACS data
      run: |
        # The term is the 4th double-quote-delimited field of the first line
        # that mentions current_term (i.e. the value in `"current_term": "<term>"`).
        # -m1 keeps the result to a single line even if the key appears twice.
        CURRENT_TERM="$(grep -m1 -e 'current_term' new-data/terms_list.json | cut -f4 -d '"')"
        # Fail with a clear message instead of building the path
        # "semester_data//catalog.json" from an empty term.
        if [ -z "$CURRENT_TERM" ]; then
          echo "::error::Could not determine current_term from new-data/terms_list.json"
          exit 1
        fi
        rsync -avz "quacs-data/semester_data/$CURRENT_TERM/catalog.json" new-data/catalog.json
    - name: Upload data to artifact
      uses: actions/upload-artifact@v4
      with:
        name: new-data
        path: new-data/
        # Downstream jobs cannot work without the data, so an empty upload is
        # an error rather than the default warning.
        if-no-files-found: error
        # The artifact is only consumed by later jobs of the same run.
        retention-days: 1
# Job (entry of the workflow's `jobs:` mapping): copy the scraped JSON files
# from the `new-data` artifact into the quatalog/data repository and push them.
push-new-data:
  name: Push new data to data repo
  runs-on: ubuntu-latest
  needs: [scrape-data]
  steps:
    - name: Clone Quatalog data
      uses: actions/checkout@v4
      with:
        repository: quatalog/data
        path: quatalog-data
        # Token used for the checkout and, via the persisted credentials,
        # for the `git push` below.
        token: ${{ secrets.PUSH_TOKEN }}
    - name: Download data from artifact
      uses: actions/download-artifact@v4
      with:
        name: new-data
        path: new-data
    - name: Copy data to repo directory
      run: |
        rsync -avz new-data/ quatalog-data/
    - name: Push new data
      working-directory: quatalog-data
      run: |
        git config user.name "Quatalog Updater"
        git config user.email "github_actions@quatalog.com"
        git add terms_offered.json prerequisites.json terms_list.json catalog.json
        # Skip the commit only when nothing is staged; any other commit
        # failure must fail the job instead of being swallowed.
        if git diff --cached --quiet; then
          echo "No data changes to commit."
          exit 0
        fi
        git commit -m "$(date)"
        git push
# Job (entry of the workflow's `jobs:` mapping): run the HTML generator on the
# `new-data` artifact and push the output to the `static-generated` branch of
# quatalog/site.
generate-site:
  name: Generate the static site
  runs-on: ubuntu-latest
  needs: [scrape-data]
  steps:
    - name: Checkout scraping repo
      uses: actions/checkout@v4
      with:
        path: quatalog-scraping
    - name: Clone Quatalog static site
      uses: actions/checkout@v4
      with:
        repository: quatalog/site
        ref: static-generated
        path: quatalog-site
        # Token used for the checkout and, via the persisted credentials,
        # for the `git push` below.
        token: ${{ secrets.PUSH_TOKEN }}
    - name: Download data from artifact
      uses: actions/download-artifact@v4
      with:
        name: new-data
        path: new-data
    - name: Run the HTML generator
      run: |
        # Usage: GenerateHtml <terms_offered_file> <prerequisites_file> <list_of_terms_file> <catalog_file> <out_directory> <searchable_catalog_file> <courses_list_file>
        quatalog-scraping/bin/GenerateHtml \
          new-data/terms_offered.json \
          new-data/prerequisites.json \
          new-data/terms_list.json \
          new-data/catalog.json \
          courses/ \
          searchable_catalog.json \
          courses_list.json
    - name: Merge data
      run: |
        # Generated pages go to courses/, the two JSON indexes to json/.
        mkdir -p quatalog-site/courses quatalog-site/json
        rsync -avz courses/ quatalog-site/courses/
        rsync -avz searchable_catalog.json quatalog-site/json/searchable_catalog.json
        rsync -avz courses_list.json quatalog-site/json/courses_list.json
    - name: Push generated HTML
      working-directory: quatalog-site
      run: |
        git config user.name "Quatalog Updater"
        git config user.email "github_actions@quatalog.com"
        git add courses json
        # Skip the commit only when nothing is staged; any other commit
        # failure must fail the job instead of being swallowed.
        if git diff --cached --quiet; then
          echo "No site changes to commit."
          exit 0
        fi
        git commit -m "$(date)"
        git push
# Job (entry of the workflow's `jobs:` mapping): build courses.csv from the
# <li> entries of the generated course pages and push it to quatalog/data.
push-csv:
  name: Push CSV file to data repo
  runs-on: ubuntu-latest
  needs: [generate-site]
  steps:
    - name: Checkout site repo/static-generated branch
      uses: actions/checkout@v4
      with:
        repository: quatalog/site
        ref: static-generated
        path: static-generated
    - name: Checkout data repo
      uses: actions/checkout@v4
      with:
        repository: quatalog/data
        path: quatalog-data
        # Token used for the checkout and, via the persisted credentials,
        # for the `git push` below.
        token: ${{ secrets.PUSH_TOKEN }}
    - name: Create CSV file
      run: |
        # A `run` step without an explicit shell uses `bash -e`, which does not
        # enable pipefail; turn it on so a failed or empty extraction fails the
        # step instead of leaving a header-only courses.csv to be pushed.
        set -o pipefail
        echo '"Instructor","Course"' > quatalog-data/courses.csv
        cd static-generated/courses
        # Pipeline stages:
        #   1. grep -H -o : emit every `<file>.html:<li>...</li>` match
        #                   (-H keeps the file prefix even for a single file)
        #   2. sort -u    : drop duplicate matches
        #   3. grep -v    : drop lines matching the excluded number/subject patterns
        #   4. sed        : quote both fields as "<file stem>","<li> text",
        #                   then swap them to "<li> text","<file stem>"
        #   5. sort       : append the rows in sorted order below the header
        grep -H -oe '<li>.*</li>' *.html |
          sort -u |
          grep -ve "-[0-9]9[4-7][0-9]" -e "-[0-9]9[89]0" -e "-[0-9]00[1-9]" -e "USA[RF]" -e "ADMN" -e "USNA" |
          sed -e 's/^/"/' -e 's/\.html:<li>/","/' -e 's#</li>#"#' -e 's/"\([^"]*\)","\([^"]*\)"/"\2","\1"/' |
          sort >> ../../quatalog-data/courses.csv
    - name: Push CSV file
      working-directory: quatalog-data
      run: |
        git config user.name "Quatalog Updater"
        git config user.email "github_actions@quatalog.com"
        git add courses.csv
        # Skip the commit only when nothing is staged; any other commit
        # failure must fail the job instead of being swallowed.
        if git diff --cached --quiet; then
          echo "No CSV changes to commit."
          exit 0
        fi
        git commit -m "$(date)"
        git push
# Job (entry of the workflow's `jobs:` mapping): package the `static-generated`
# branch of quatalog/site as the `github-pages` artifact.
prepare-site:
  name: Prepare static site
  runs-on: ubuntu-latest
  needs: [generate-site]
  steps:
    - name: Checkout static-generated branch
      uses: actions/checkout@v4
      with:
        repository: quatalog/site
        ref: static-generated
    - name: Setup Pages
      uses: actions/configure-pages@v4
    - name: Archive github-pages artifact
      run: |
        # Archive the checkout with symlinks and hard links resolved to file
        # contents, leaving out repository metadata and the LICENSE/README.
        tar \
          --dereference --hard-dereference \
          --exclude=.git \
          --exclude=.github \
          --exclude=LICENSE \
          --exclude=README.md \
          -cf "$RUNNER_TEMP/artifact.tar" .
    - name: Upload github-pages artifact
      uses: actions/upload-artifact@v4
      with:
        name: github-pages
        path: ${{ runner.temp }}/artifact.tar
        # A missing archive must fail here rather than only produce the
        # default warning.
        if-no-files-found: error
        # The archive is uploaded under the name `github-pages` for use later
        # in this run, so it does not need long-term storage.
        retention-days: 1
# Job (entry of the workflow's `jobs:` mapping): deploy to GitHub Pages once
# prepare-site has finished.
deploy-site:
  name: Deploy to GitHub Pages
  runs-on: ubuntu-latest
  needs: [prepare-site]
  # Token scopes granted to this job.
  permissions:
    pages: write
    id-token: write
  # Jobs in the "pages" group do not run concurrently; a newer one cancels
  # the one in progress.
  concurrency:
    group: "pages"
    cancel-in-progress: true
  environment:
    name: github-pages
    # Environment URL comes from the `deployment` step's page_url output.
    url: ${{ steps.deployment.outputs.page_url }}
  steps:
    - name: Deploy to GitHub Pages
      id: deployment
      uses: actions/deploy-pages@v4