// quatalog-scraper/courseinfo_scraper/CourseOfferingsScraper.cpp
#include<algorithm>
#include<array>
#include<cstdlib>
#include<exception>
#include<filesystem>
#include<fstream>
#include<iostream>
#include<memory>
#include<regex>
#include<set>
#include<stdexcept>
#include<string>
#include<unordered_map>
#include<unordered_set>

#include<json/json.h>
// Short alias for the standard filesystem library, used throughout this file
namespace fs = std::filesystem;
// Parsed JSON inputs for a single term directory (filled in by handle_term)
struct term_data_t {
// Contents of the term's courses.json
Json::Value courses;
// Contents of the term's prerequisites.json (indexed by CRN in handle_prereqs)
Json::Value prerequisites;
};
// Accumulated output of the scraper; each member is written to its own
// JSON file at the end of main
struct quatalog_data_t {
    // Keyed by course id: one entry per (sub)term the course was offered in,
    // plus a "latest_term" entry
    Json::Value terms_offered;

    // Keyed by course id: "attributes", "corequisites", "prerequisites"
    // and "cross_listings"
    Json::Value prerequisites;

    // "oldest_term", "current_term" and the "all_terms" array
    Json::Value list_of_terms;
};
// Maps a full attribute name to the bracketed short tag that
// handle_term_attribute appends to a term's attribute list.
// Attributes that are not listed here get no short tag.
const std::unordered_map<std::string,std::string> attr_to_short_attr {
{ "Communication Intensive", "[CI]" },
{ "Writing Intensive", "[WI]" },
{ "HASS Inquiry", "[HInq]" },
{ "Culminating Exp/Capstone", "[CulmExp]" },
{ "PDII Option for Engr Majors", "[PDII]" }
};
using course_handler_t = void(const Json::Value&,const std::string&,quatalog_data_t&,const Json::Value&);
void handle_term_dirs(const std::set<fs::directory_entry>&,quatalog_data_t&);
void handle_term(const fs::directory_entry& term_entry,quatalog_data_t&);
2023-02-05 03:06:28 +00:00
void handle_prefix(const Json::Value&,const std::string&,quatalog_data_t&,const term_data_t&,course_handler_t*);
2023-02-05 02:09:05 +00:00
void handle_course(const Json::Value&,const std::string&,quatalog_data_t&,const Json::Value&);
void handle_course_summer(const Json::Value&,const std::string&,quatalog_data_t&,const Json::Value&);
2023-02-07 23:10:56 +00:00
void handle_everything(const Json::Value&,const Json::Value&,const std::string&,Json::Value& course_term,Json::Value&,const Json::Value&);
2023-02-05 02:11:08 +00:00
void handle_sections(const Json::Value&,Json::Value&);
void handle_instructors(const Json::Value&,std::unordered_set<std::string>&);
void handle_multiple_instructors(const std::string&,std::unordered_set<std::string>&);
void handle_attributes(const Json::Value&,const std::string&,Json::Value&,Json::Value&);
void handle_term_attribute(const std::string&,Json::Value&);
void handle_attribute(const std::string&,Json::Value&,Json::Value&);
template<typename Functor> void iterate_on_delimited_string(const std::string&,const std::regex&,const Functor&);
2023-02-05 03:06:28 +00:00
void handle_prereqs(const Json::Value&,const std::string&,Json::Value&,const Json::Value&);
int main(const int argc,
const char** argv) {
2023-02-07 14:43:43 +00:00
if(argc != 5) {
std::cerr << "Bad number of arguments (" << argc << ")" << std::endl;
2023-02-05 04:50:51 +00:00
std::cerr << "Usage: " << argv[0]
<< " <data_directory>"
<< " <terms_offered_file>"
<< " <prerequisites_file>"
2023-02-07 14:42:37 +00:00
<< " <list_of_terms_file>"
<< std::endl;
2023-02-04 17:38:37 +00:00
return EXIT_FAILURE;
}
2023-02-05 02:09:05 +00:00
const auto& data_dir_path = fs::path(argv[1]);
2023-02-06 19:54:54 +00:00
const auto& terms_offered_filename = std::string(argv[2]);
const auto& prerequisites_filename = std::string(argv[3]);
2023-02-07 14:42:37 +00:00
const auto& list_of_terms_filename = std::string(argv[4]);
2023-02-05 02:09:05 +00:00
2023-02-04 17:38:37 +00:00
if(!fs::is_directory(data_dir_path)) {
2023-02-06 19:54:54 +00:00
std::cerr << "Data directory argument "
<< data_dir_path
<< " is not a directory" << std::endl;
2023-02-04 17:38:37 +00:00
return EXIT_FAILURE;
}
2023-02-04 22:29:50 +00:00
// Sort term dirs chronologically using a std::set
2023-02-04 17:38:37 +00:00
std::set<fs::directory_entry> term_dirs;
2023-02-06 19:54:54 +00:00
const auto& data_dir = fs::directory_iterator(data_dir_path);
2023-02-04 22:29:50 +00:00
for(const auto& term : data_dir) {
2023-02-04 17:38:37 +00:00
term_dirs.insert(term);
}
2023-02-04 22:29:50 +00:00
// Begin JSON manipulation
2023-02-05 02:09:05 +00:00
quatalog_data_t data;
2023-02-09 19:55:35 +00:00
// TODO: Once change to QuACS that accounts for prerelease data
// is merged, change this
data.list_of_terms["oldest_term"] = term_dirs.begin()->path().stem().string();
data.list_of_terms["current_term"] = term_dirs.rbegin()->path().stem().string();
2023-02-05 04:50:51 +00:00
handle_term_dirs(term_dirs,data);
2023-02-04 22:29:50 +00:00
2023-02-07 13:56:59 +00:00
Json::StreamWriterBuilder swb;
swb["indentation"] = " ";
std::unique_ptr<Json::StreamWriter> outWriter(swb.newStreamWriter());
2023-02-07 14:42:37 +00:00
std::fstream terms_offered_file{terms_offered_filename,std::ios::out};
std::fstream prerequisites_file{prerequisites_filename,std::ios::out};
std::fstream list_of_terms_file{list_of_terms_filename,std::ios::out};
2023-02-07 13:56:59 +00:00
outWriter->write(data.terms_offered,&terms_offered_file);
outWriter->write(data.prerequisites,&prerequisites_file);
2023-02-07 14:42:37 +00:00
outWriter->write(data.list_of_terms,&list_of_terms_file);
2023-02-05 04:50:51 +00:00
2023-02-05 02:09:05 +00:00
terms_offered_file.close();
2023-02-05 03:06:28 +00:00
prerequisites_file.close();
2023-02-07 14:42:37 +00:00
list_of_terms_file.close();
2023-02-04 22:29:50 +00:00
return EXIT_SUCCESS;
}
void handle_term_dirs(const std::set<fs::directory_entry>& term_dirs,
2023-02-05 02:09:05 +00:00
quatalog_data_t& data) {
2023-02-06 19:54:54 +00:00
for(const auto& term : term_dirs) {
if(!fs::is_directory(term)) continue;
2023-02-05 02:09:05 +00:00
handle_term(term,data);
2023-02-04 17:38:37 +00:00
}
}
// Processes one term directory.
//
// Reads <dir>/courses.json and <dir>/prerequisites.json, appends the term
// (the directory's stem) to list_of_terms["all_terms"] and feeds every
// course prefix of the term to handle_prefix.
//
// A term whose name has "05" at offset 4 is treated as a summer term: the
// two half-terms (term + "02" and term + "03") are added to "all_terms"
// as well and courses go through handle_course_summer instead of
// handle_course.
//
// Throws std::runtime_error if either input file cannot be opened; the
// JSON stream extraction may also throw if a file fails to parse.
void handle_term(const fs::directory_entry& term_entry,
                 quatalog_data_t& quatalog_data) {
    const fs::path dir = term_entry.path();
    const std::string dirname = dir.string();
    const std::string term = dir.stem().string();
    const std::string courses_filename = dirname + "/courses.json";
    const std::string prereqs_filename = dirname + "/prerequisites.json";

    std::cerr << "Processing term " << term << "..." << std::endl;
    quatalog_data.list_of_terms["all_terms"].append(term);

    // Fail with the offending file name instead of an anonymous parse
    // error coming from an unopened stream
    std::ifstream courses_file{courses_filename};
    if(!courses_file) {
        throw std::runtime_error("Could not open " + courses_filename);
    }
    std::ifstream prereqs_file{prereqs_filename};
    if(!prereqs_file) {
        throw std::runtime_error("Could not open " + prereqs_filename);
    }

    term_data_t term_data;
    courses_file >> term_data.courses;
    prereqs_file >> term_data.prerequisites;

    // The size check keeps short directory names from throwing
    // std::out_of_range; such names are simply not summer terms
    const bool is_summer = term.size() >= 6 && term.compare(4,2,"05") == 0;
    course_handler_t* course_handler = handle_course;
    if(is_summer) {
        quatalog_data.list_of_terms["all_terms"].append(term+"02");
        quatalog_data.list_of_terms["all_terms"].append(term+"03");
        course_handler = handle_course_summer;
    }

    for(const auto& prefix : term_data.courses) {
        handle_prefix(prefix,term,quatalog_data,term_data,course_handler);
    }
    // Both streams are closed by their destructors
}
// Runs `course_handler` on every entry of prefix["courses"], handing it
// the term name, the accumulated output and the term's prerequisite data.
void handle_prefix(const Json::Value& prefix,
                   const std::string& term,
                   quatalog_data_t& quatalog_data,
                   const term_data_t& term_data,
                   course_handler_t course_handler) {
    const Json::Value& courses = prefix["courses"];
    const Json::Value& term_prereqs = term_data.prerequisites;
    for(auto course = courses.begin(); course != courses.end(); ++course) {
        course_handler(*course,term,quatalog_data,term_prereqs);
    }
}
// Handler for courses of a regular (non-summer) term: all of the course's
// sections are recorded under `term` in data.terms_offered[<course id>].
void handle_course(const Json::Value& course,
                   const std::string& term,
                   quatalog_data_t& data,
                   const Json::Value& term_prereqs) {
    const std::string id = course["id"].asString();
    Json::Value& offerings = data.terms_offered[id];
    handle_everything(course["sections"],
                      course,
                      term,
                      offerings,
                      data.prerequisites,
                      term_prereqs);
}
// Handler for courses of a summer term.
//
// Sections are sorted into three buckets by the dates of their first
// timeslot:
//   bucket 0: full term        (starts with "05" and ends with "08")
//   bucket 1: first half term  (starts with "05", does not end with "08")
//   bucket 2: second half term (does not start with "05")
//
// A course is not expected to be offered in the full term _and_ in one of
// the half-terms, so if any full-term section exists only bucket 0 is
// recorded (under `term`). Otherwise each non-empty half is recorded under
// term + "02" / term + "03"; a few courses are offered in both halves
// (e.g. STSO-4100).
//
// The sections get looped over once here and once more in handle_sections;
// doing it all in a single pass was tried and turned out unreadable.
void handle_course_summer(const Json::Value& course,
                          const std::string& term,
                          quatalog_data_t& data,
                          const Json::Value& term_prereqs) {
    std::array<Json::Value,3> buckets;
    std::array<bool,3> present{};

    for(const auto& section : course["sections"]) {
        const Json::Value& first_slot = section["timeslots"][0];
        const std::string starts = first_slot["dateStart"].asString();
        const std::string ends = first_slot["dateEnd"].asString();

        std::size_t which = 0;
        if(starts.compare(0,2,"05") != 0) {
            which = 2;
        } else if(ends.compare(0,2,"08") != 0) {
            which = 1;
        }

        present[which] = true;
        buckets[which].append(section);
    }

    Json::Value& offerings = data.terms_offered[course["id"].asString()];

    if(present[0]) {
        handle_everything(buckets[0],course,term,
                          offerings,data.prerequisites,term_prereqs);
        return;
    }

    for(std::size_t half = 1; half <= 2; ++half) {
        if(!present[half]) continue;
        handle_everything(buckets[half],course,term + (half == 1 ? "02" : "03"),
                          offerings,data.prerequisites,term_prereqs);
    }
}
// Records one offering of a course in one (sub)term.
//
//   sections     - the sections belonging to this (sub)term
//   course       - the course object; only its "id" is read here
//   term         - key of the (sub)term inside `course_terms`
//   course_terms - the course's entry in terms_offered; gets course_terms[term]
//                  filled in and "latest_term" set to `term`
//   out_prereqs  - prerequisites output, updated for this course id
//   term_prereqs - the term's prerequisite data, looked up by CRN
//
// Attributes and prerequisites are taken from the first section only.
void handle_everything(const Json::Value& sections,
                       const Json::Value& course,
                       const std::string& term,
                       Json::Value& course_terms,
                       Json::Value& out_prereqs,
                       const Json::Value& term_prereqs) {
    Json::Value& course_term = course_terms[term];
    const std::string course_id = course["id"].asString();

    // Titles are per section. Collect the distinct ones from the sections
    // of THIS (sub)term only, so that a summer half-term does not list
    // titles of sections that ran in the other half. For regular terms
    // `sections` is the course's full section list, so nothing changes.
    // std::set both de-duplicates and sorts the titles.
    std::set<std::string> titles;
    for(const auto& sec : sections) {
        titles.insert(sec["title"].asString());
    }
    course_term["title"] = Json::arrayValue;
    for(const auto& title : titles) {
        course_term["title"].append(title);
    }

    handle_sections(sections,course_term);
    course_terms["latest_term"] = term;
    handle_attributes(sections[0],course_id,course_term,out_prereqs);
    handle_prereqs(sections[0],course_id,out_prereqs,term_prereqs);
}
// Aggregates credit, seat and instructor data over `sections` and stores
// the result in `course_term`:
//   course_term["credits"]["min"/"max"]                  - range over all sections
//   course_term["seats"]["taken"/"capacity"/"remaining"] - sums over all sections
//   course_term["instructors"]                           - distinct instructor names
//     (only created when at least one instructor was found)
void handle_sections(const Json::Value& sections,
                     Json::Value& course_term) {
    int credMin = Json::Value::maxInt, credMax = 0;
    int seatsTaken = 0, capacity = 0, remaining = 0;
    std::unordered_set<std::string> instructors;
    for(const auto& section : sections) {
        // Get min/max credits *of all sections*
        // (RCOS looking at you)
        credMin = std::min(credMin,section["credMin"].asInt());
        credMax = std::max(credMax,section["credMax"].asInt());
        // Add seating data of all sections together.
        // remaining might get clobbered by some sections
        // having negative seats, but this probably won't
        // be too much of an issue
        seatsTaken += section["act"].asInt();
        capacity += section["cap"].asInt();
        remaining += section["rem"].asInt();
        handle_instructors(section,instructors);
    }
    // With no sections the loop never runs; do not leak the
    // maxInt sentinel into the output
    if(sections.empty()) {
        credMin = 0;
    }
    course_term["credits"]["min"] = credMin;
    course_term["credits"]["max"] = credMax;
    course_term["seats"]["taken"] = seatsTaken;
    course_term["seats"]["capacity"] = capacity;
    course_term["seats"]["remaining"] = remaining;
    for(const auto& instructor : instructors) {
        course_term["instructors"].append(instructor);
    }
}
// Adds the instructors named in each timeslot of `section` to `instructors`.
// A timeslot's "instructor" string may name several people; splitting it
// is left to handle_multiple_instructors.
void handle_instructors(const Json::Value& section,
                        std::unordered_set<std::string>& instructors) {
    const Json::Value& slots = section["timeslots"];
    for(auto slot = slots.begin(); slot != slots.end(); ++slot) {
        const std::string names = (*slot)["instructor"].asString();
        handle_multiple_instructors(names,instructors);
    }
}
// Splits a comma-separated instructor string (the comma may be followed
// by one space) and inserts every name except the placeholder "TBA"
// into `instructors`.
void handle_multiple_instructors(const std::string& instructor_str,
                                 std::unordered_set<std::string>& instructors) {
    // Compiling a std::regex is expensive and this runs once per timeslot,
    // so build the delimiter only once
    static const std::regex delimiter(", ?");
    iterate_on_delimited_string(instructor_str,
                                delimiter,
                                [&](const std::string& inst_str) {
                                    if(inst_str == "TBA") return;
                                    instructors.insert(inst_str);
                                });
}
// Builds the attribute lists for a course from section["attribute"], a
// string whose entries are separated by " and " or ", ".
//
//   course_term["attributes"]           - reset, then filled with the short
//                                         tags of the attributes that have one
//   out_prereqs[course_id]["attributes"] - replaced by the full attribute names
//
// Filtering of individual attributes happens in handle_attribute.
void handle_attributes(const Json::Value& section,
                       const std::string& course_id,
                       Json::Value& course_term,
                       Json::Value& out_prereqs) {
    // Compiling a std::regex is expensive and this runs once per course
    // offering, so build the delimiter only once
    static const std::regex delimiter(" and |, ");

    // Makes the JSON list of attributes
    Json::Value& term_attributes = course_term["attributes"];
    Json::Value attributes = Json::arrayValue;
    term_attributes = Json::arrayValue;
    iterate_on_delimited_string(section["attribute"].asString(),
                                delimiter,
                                [&](const std::string& attr_str) {
                                    handle_attribute(attr_str,
                                                     attributes,
                                                     term_attributes);
                                });
    out_prereqs[course_id]["attributes"] = attributes;
}
// Records a single attribute: its full name goes into `attributes` and,
// if it has one, its short tag goes into `term_attributes`.
// The delivery-mode attributes are dropped entirely; the COVID year
// screwed them up.
void handle_attribute(const std::string& attribute,
                      Json::Value& attributes,
                      Json::Value& term_attributes) {
    const bool is_delivery_mode = attribute == "Hybrid:Online/In-Person Course"
                               || attribute == "Online Course"
                               || attribute == "In-Person Course";
    if(is_delivery_mode) {
        return;
    }
    attributes.append(attribute);
    handle_term_attribute(attribute,term_attributes);
}
// Appends the short tag of `attribute` to `attributes` when
// attr_to_short_attr has one; attributes without a short tag are skipped.
void handle_term_attribute(const std::string& attribute,
                           Json::Value& attributes) {
    const auto found = attr_to_short_attr.find(attribute);
    if(found == attr_to_short_attr.end()) {
        return;
    }
    attributes.append(found->second);
}
// Splits `str` at every match of `delim` and calls `callback` with each
// non-empty piece, from left to right.
//
// Empty pieces (an empty input string, a leading delimiter or two
// adjacent delimiters) are skipped; the pieces that follow them are
// still delivered.
//
// This is basically C++'s string split, but without using as much
// memory as an actual string split: pieces are produced one at a time
// instead of being collected into a container first.
template<typename Functor>
void iterate_on_delimited_string(const std::string& str,
                                 const std::regex& delim,
                                 const Functor& callback) {
    const std::sregex_token_iterator end_itr;
    // Submatch index -1 selects the text between the delimiter matches
    for(std::sregex_token_iterator itr(str.begin(),str.end(),delim,-1);
        itr != end_itr;
        ++itr) {
        if(itr->length() == 0) continue;
        callback(itr->str());
    }
}
// Copies the requisite data of `section` into the output entry of a course.
//
// The section's "crn" is looked up in `term_prereqs`; the "corequisites",
// "prerequisites" and "cross_list_courses" found there are stored in
// out_data[course_id] as "corequisites", "prerequisites" and
// "cross_listings", replacing whatever was stored before.
void handle_prereqs(const Json::Value& section,
                    const std::string& course_id,
                    Json::Value& out_data,
                    const Json::Value& term_prereqs) {
    const std::string crn = section["crn"].asString();
    const Json::Value& crn_info = term_prereqs[crn];

    Json::Value& course_entry = out_data[course_id];
    course_entry["corequisites"] = crn_info["corequisites"];
    course_entry["prerequisites"] = crn_info["prerequisites"];
    course_entry["cross_listings"] = crn_info["cross_list_courses"];
}